No longer remove surrogates (#1804)

max-leuthaeuser · web-flow · commit 055c2a7fd210 · 2025-01-13T16:15:03.000+01:00
Characters outside the Basic Multilingual Plane can only be represented in Java/Scala (UTF-16) strings using surrogate pairs.
By replacing them we lose most emojis and similar characters.
If we ever encounter actual unpaired surrogates in code, we need a different fix.
diff --git a/codepropertygraph/src/main/scala/io/shiftleft/utils/IOUtils.scala b/codepropertygraph/src/main/scala/io/shiftleft/utils/IOUtils.scala
@@ -10,8 +10,6 @@ import scala.util.Using
 
 object IOUtils {
 
-  private val surrogatePattern: Pattern = Pattern.compile("[^\u0000-\uffff]")
-
   private val boms: Set[Char] = Set(
     '\uefbb', // UTF-8
     '\ufeff', // UTF-16 (BE)
@@ -39,27 +37,10 @@ object IOUtils {
     }
   }
 
-  /** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
-    * In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
-    * talk about removing surrogates, which don't exist in the character / code point representation (unless you have
-    * rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
-    * characters which will require surrogates when encoded. That means any character which lies beyond the basic
-    * multilingual plane. You can do that with a simple regular expression.
-    */
-  private def replaceUnpairedSurrogates(input: String): String = {
-    val matches = surrogatePattern.matcher(input)
-    if (matches.find()) {
-      val size = matches.end() - matches.start()
-      matches.replaceAll("?" * size)
-    } else {
-      input
-    }
-  }
-
   private def contentFromBufferedSource(bufferedSource: BufferedSource): Seq[String] = {
     val reader = bufferedSource.bufferedReader()
     skipBOMIfPresent(reader)
-    reader.lines().iterator().asScala.map(replaceUnpairedSurrogates).toSeq
+    reader.lines().iterator().asScala.toSeq
   }
 
   private def contentStringFromBufferedSource(bufferedSource: BufferedSource): String = {
@@ -78,7 +59,7 @@ object IOUtils {
       }
     }
 
-    replaceUnpairedSurrogates(stringBuilder.toString)
+    stringBuilder.toString
   }
 
   /** Reads a file at the given path and: