@@ -10,8 +10,6 @@ import scala.util.Using
10
10
11
11
object IOUtils {
12
12
13
- private val surrogatePattern : Pattern = Pattern .compile(" [^\u0000 -\uffff ]" )
14
-
15
13
private val boms : Set [Char ] = Set (
16
14
'\uefbb ' , // UTF-8
17
15
'\ufeff ' , // UTF-16 (BE)
@@ -39,27 +37,10 @@ object IOUtils {
39
37
}
40
38
}
41
39
42
- /** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
43
- * In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
44
- * talk about removing surrogates, which don't exist in the character / code point representation (unless you have
45
- * rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
46
- * characters which will require surrogates when encoded. That means any character which lies beyond the basic
47
- * multilingual plane. You can do that with a simple regular expression.
48
- */
49
// NOTE(review): this helper is on the deleted side of the diff. `size` is computed once, from the first match found
// by `find()`, and the same replacement string is then handed to `replaceAll`, so every match gets a replacement of
// that one length. The name says "unpaired surrogates", while the Scaladoc above describes characters outside the
// basic multilingual plane — confirm which was intended before relying on the old behaviour.
- private def replaceUnpairedSurrogates (input : String ): String = {
50
- val matches = surrogatePattern.matcher(input)
51
- if (matches.find()) {
52
- val size = matches.end() - matches.start()
53
- matches.replaceAll(" ?" * size)
54
- } else {
55
- input
56
- }
57
- }
58
-
59
40
/** Reads all lines from `bufferedSource` and returns them as a `Seq[String]`.
  *
  * Obtains a buffered reader from the source, calls `skipBOMIfPresent` on it, and then collects the reader's
  * `lines()` into a sequence. With this change the lines are returned as read: the per-line
  * `replaceUnpairedSurrogates` mapping step is removed.
  *
  * NOTE(review): the reader is not closed in this method — presumably the caller owns the source; confirm.
  */
private def contentFromBufferedSource (bufferedSource : BufferedSource ): Seq [String ] = {
60
41
val reader = bufferedSource.bufferedReader()
61
42
skipBOMIfPresent(reader)
62
- reader.lines().iterator().asScala.map(replaceUnpairedSurrogates). toSeq
43
+ reader.lines().iterator().asScala.toSeq
63
44
}
64
45
65
46
private def contentStringFromBufferedSource (bufferedSource : BufferedSource ): String = {
@@ -78,7 +59,7 @@ object IOUtils {
78
59
}
79
60
}
80
61
81
- replaceUnpairedSurrogates( stringBuilder.toString)
62
+ stringBuilder.toString
82
63
}
83
64
84
65
/** Reads a file at the given path and:
0 commit comments