Skip to content

Commit 055c2a7

Browse files
No longer remove surrogates (#1804)
Characters outside the Basic Multilingual Plane can only be represented in Java's UTF-16 strings using surrogate pairs. By replacing them we lose most emojis and similar characters. If we ever encounter actual unpaired surrogates in code, we need a different fix.
1 parent b823397 commit 055c2a7

File tree

1 file changed

+2
-21
lines changed

1 file changed

+2
-21
lines changed

Diff for: codepropertygraph/src/main/scala/io/shiftleft/utils/IOUtils.scala

+2-21
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ import scala.util.Using
1010

1111
object IOUtils {
1212

13-
private val surrogatePattern: Pattern = Pattern.compile("[^\u0000-\uffff]")
14-
1513
private val boms: Set[Char] = Set(
1614
'\uefbb', // UTF-8
1715
'\ufeff', // UTF-16 (BE)
@@ -39,27 +37,10 @@ object IOUtils {
3937
}
4038
}
4139

42-
/** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
43-
* In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
44-
* talk about removing surrogates, which don't exist in the character / code point representation (unless you have
45-
* rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
46-
* characters which will require surrogates when encoded. That means any character which lies beyond the basic
47-
* multilingual plane. You can do that with a simple regular expression.
48-
*/
49-
private def replaceUnpairedSurrogates(input: String): String = {
50-
val matches = surrogatePattern.matcher(input)
51-
if (matches.find()) {
52-
val size = matches.end() - matches.start()
53-
matches.replaceAll("?" * size)
54-
} else {
55-
input
56-
}
57-
}
58-
5940
private def contentFromBufferedSource(bufferedSource: BufferedSource): Seq[String] = {
6041
val reader = bufferedSource.bufferedReader()
6142
skipBOMIfPresent(reader)
62-
reader.lines().iterator().asScala.map(replaceUnpairedSurrogates).toSeq
43+
reader.lines().iterator().asScala.toSeq
6344
}
6445

6546
private def contentStringFromBufferedSource(bufferedSource: BufferedSource): String = {
@@ -78,7 +59,7 @@ object IOUtils {
7859
}
7960
}
8061

81-
replaceUnpairedSurrogates(stringBuilder.toString)
62+
stringBuilder.toString
8263
}
8364

8465
/** Reads a file at the given path and:

0 commit comments

Comments
 (0)