Skip to content

Commit 3103627

Browse files
uros-db authored and cloud-fan committed
[SPARK-47410][SQL] Refactor UTF8String and CollationFactory
### What changes were proposed in this pull request? This PR introduces comprehensive support for collation-aware expressions in Spark, focusing on improving code structure, clarity, and testing coverage for various expressions (including: Contains, StartsWith, EndsWith). ### Why are the changes needed? The changes are essential to improve the maintainability and readability of collation-related code in Spark expressions. By restructuring and centralizing collation support in the new CollationSupport class, we simplify the addition of new collation-aware operations and ensure consistent testing across different collation types. ### Does this PR introduce _any_ user-facing change? No, this PR is focused on internal refactoring and testing enhancements for collation-aware expression support. ### How was this patch tested? Unit tests in CollationSupportSuite.java and E2E tests in CollationStringExpressionsSuite.scala. ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #45978 from uros-db/SPARK-47410. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 6e371e1 commit 3103627

File tree

10 files changed

+874
-700
lines changed

10 files changed

+874
-700
lines changed

Diff for: common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java

+41-13
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,36 @@ public static class Collation {
7878
*/
7979
public final boolean supportsBinaryOrdering;
8080

81+
/**
82+
* Support for Lowercase Equality implies that it is possible to check equality on
83+
* byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments.
84+
* This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark
85+
* expressions, as this particular collation is not supported by the external ICU library.
86+
*/
87+
public final boolean supportsLowercaseEquality;
88+
8189
public Collation(
8290
String collationName,
8391
Collator collator,
8492
Comparator<UTF8String> comparator,
8593
String version,
8694
ToLongFunction<UTF8String> hashFunction,
8795
boolean supportsBinaryEquality,
88-
boolean supportsBinaryOrdering) {
96+
boolean supportsBinaryOrdering,
97+
boolean supportsLowercaseEquality) {
8998
this.collationName = collationName;
9099
this.collator = collator;
91100
this.comparator = comparator;
92101
this.version = version;
93102
this.hashFunction = hashFunction;
94103
this.supportsBinaryEquality = supportsBinaryEquality;
95104
this.supportsBinaryOrdering = supportsBinaryOrdering;
105+
this.supportsLowercaseEquality = supportsLowercaseEquality;
96106

97107
// De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality
98108
assert(!supportsBinaryOrdering || supportsBinaryEquality);
109+
// No Collation can simultaneously support binary equality and lowercase equality
110+
assert(!supportsBinaryEquality || !supportsLowercaseEquality);
99111

100112
if (supportsBinaryEquality) {
101113
this.equalsFunction = UTF8String::equals;
@@ -112,15 +124,17 @@ public Collation(
112124
Collator collator,
113125
String version,
114126
boolean supportsBinaryEquality,
115-
boolean supportsBinaryOrdering) {
127+
boolean supportsBinaryOrdering,
128+
boolean supportsLowercaseEquality) {
116129
this(
117130
collationName,
118131
collator,
119132
(s1, s2) -> collator.compare(s1.toString(), s2.toString()),
120133
version,
121134
s -> (long)collator.getCollationKey(s.toString()).hashCode(),
122135
supportsBinaryEquality,
123-
supportsBinaryOrdering);
136+
supportsBinaryOrdering,
137+
supportsLowercaseEquality);
124138
}
125139
}
126140

@@ -141,7 +155,8 @@ public Collation(
141155
"1.0",
142156
s -> (long)s.hashCode(),
143157
true,
144-
true);
158+
true,
159+
false);
145160

146161
// Case-insensitive UTF8 binary collation.
147162
// TODO: Do in place comparisons instead of creating new strings.
@@ -152,17 +167,18 @@ public Collation(
152167
"1.0",
153168
(s) -> (long)s.toLowerCase().hashCode(),
154169
false,
155-
false);
170+
false,
171+
true);
156172

157173
// UNICODE case sensitive comparison (ROOT locale, in ICU).
158174
collationTable[2] = new Collation(
159-
"UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false);
175+
"UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false, false);
160176
collationTable[2].collator.setStrength(Collator.TERTIARY);
161177
collationTable[2].collator.freeze();
162178

163179
// UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary strength).
164180
collationTable[3] = new Collation(
165-
"UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false);
181+
"UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false, false);
166182
collationTable[3].collator.setStrength(Collator.SECONDARY);
167183
collationTable[3].collator.freeze();
168184

@@ -172,19 +188,31 @@ public Collation(
172188
}
173189

174190
/**
175-
* Auxiliary methods for collation aware string operations.
191+
* Returns a StringSearch object for the given pattern and target strings, under collation
192+
* rules corresponding to the given collationId. The external ICU library StringSearch object can
193+
* be used to find occurrences of the pattern in the target string, while respecting collation.
176194
*/
177-
178195
public static StringSearch getStringSearch(
179-
final UTF8String left,
180-
final UTF8String right,
196+
final UTF8String targetUTF8String,
197+
final UTF8String patternUTF8String,
181198
final int collationId) {
182-
String pattern = right.toString();
183-
CharacterIterator target = new StringCharacterIterator(left.toString());
199+
String pattern = patternUTF8String.toString();
200+
CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString());
184201
Collator collator = CollationFactory.fetchCollation(collationId).collator;
185202
return new StringSearch(pattern, target, (RuleBasedCollator) collator);
186203
}
187204

205+
/**
206+
* Returns a collation-unaware StringSearch object for the given pattern and target strings.
207+
* While this object does not respect collation, it can be used to find occurrences of the pattern
208+
* in the target string for UTF8_BINARY or UTF8_BINARY_LCASE (if arguments are lowercased).
209+
*/
210+
public static StringSearch getStringSearch(
211+
final UTF8String targetUTF8String,
212+
final UTF8String patternUTF8String) {
213+
return new StringSearch(patternUTF8String.toString(), targetUTF8String.toString());
214+
}
215+
188216
/**
189217
* Returns the collation id for the given collation name.
190218
*/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.catalyst.util;
18+
19+
import com.ibm.icu.text.StringSearch;
20+
21+
import org.apache.spark.unsafe.types.UTF8String;
22+
23+
/**
24+
* Static entry point for collation-aware expressions (StringExpressions, RegexpExpressions, and
25+
* other expressions that require custom collation support), as well as private utility methods for
26+
* collation-aware UTF8String operations needed to implement them.
27+
*/
28+
public final class CollationSupport {
29+
30+
/**
31+
* Collation-aware string expressions.
32+
*/
33+
34+
public static class Contains {
35+
public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
36+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
37+
if (collation.supportsBinaryEquality) {
38+
return execBinary(l, r);
39+
} else if (collation.supportsLowercaseEquality) {
40+
return execLowercase(l, r);
41+
} else {
42+
return execICU(l, r, collationId);
43+
}
44+
}
45+
public static String genCode(final String l, final String r, final int collationId) {
46+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
47+
String expr = "CollationSupport.Contains.exec";
48+
if (collation.supportsBinaryEquality) {
49+
return String.format(expr + "Binary(%s, %s)", l, r);
50+
} else if (collation.supportsLowercaseEquality) {
51+
return String.format(expr + "Lowercase(%s, %s)", l, r);
52+
} else {
53+
return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
54+
}
55+
}
56+
public static boolean execBinary(final UTF8String l, final UTF8String r) {
57+
return l.contains(r);
58+
}
59+
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
60+
return l.toLowerCase().contains(r.toLowerCase());
61+
}
62+
public static boolean execICU(final UTF8String l, final UTF8String r,
63+
final int collationId) {
64+
if (r.numBytes() == 0) return true;
65+
if (l.numBytes() == 0) return false;
66+
StringSearch stringSearch = CollationFactory.getStringSearch(l, r, collationId);
67+
return stringSearch.first() != StringSearch.DONE;
68+
}
69+
}
70+
71+
public static class StartsWith {
72+
public static boolean exec(final UTF8String l, final UTF8String r,
73+
final int collationId) {
74+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
75+
if (collation.supportsBinaryEquality) {
76+
return execBinary(l, r);
77+
} else if (collation.supportsLowercaseEquality) {
78+
return execLowercase(l, r);
79+
} else {
80+
return execICU(l, r, collationId);
81+
}
82+
}
83+
public static String genCode(final String l, final String r, final int collationId) {
84+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
85+
String expr = "CollationSupport.StartsWith.exec";
86+
if (collation.supportsBinaryEquality) {
87+
return String.format(expr + "Binary(%s, %s)", l, r);
88+
} else if (collation.supportsLowercaseEquality) {
89+
return String.format(expr + "Lowercase(%s, %s)", l, r);
90+
} else {
91+
return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
92+
}
93+
}
94+
public static boolean execBinary(final UTF8String l, final UTF8String r) {
95+
return l.startsWith(r);
96+
}
97+
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
98+
return l.toLowerCase().startsWith(r.toLowerCase());
99+
}
100+
public static boolean execICU(final UTF8String l, final UTF8String r,
101+
final int collationId) {
102+
return CollationAwareUTF8String.matchAt(l, r, 0, collationId);
103+
}
104+
}
105+
106+
public static class EndsWith {
107+
public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
108+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
109+
if (collation.supportsBinaryEquality) {
110+
return execBinary(l, r);
111+
} else if (collation.supportsLowercaseEquality) {
112+
return execLowercase(l, r);
113+
} else {
114+
return execICU(l, r, collationId);
115+
}
116+
}
117+
public static String genCode(final String l, final String r, final int collationId) {
118+
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
119+
String expr = "CollationSupport.EndsWith.exec";
120+
if (collation.supportsBinaryEquality) {
121+
return String.format(expr + "Binary(%s, %s)", l, r);
122+
} else if (collation.supportsLowercaseEquality) {
123+
return String.format(expr + "Lowercase(%s, %s)", l, r);
124+
} else {
125+
return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
126+
}
127+
}
128+
public static boolean execBinary(final UTF8String l, final UTF8String r) {
129+
return l.endsWith(r);
130+
}
131+
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
132+
return l.toLowerCase().endsWith(r.toLowerCase());
133+
}
134+
public static boolean execICU(final UTF8String l, final UTF8String r,
135+
final int collationId) {
136+
return CollationAwareUTF8String.matchAt(l, r, l.numBytes() - r.numBytes(), collationId);
137+
}
138+
}
139+
140+
// TODO: Add more collation-aware string expressions.
141+
142+
/**
143+
* Collation-aware regexp expressions.
144+
*/
145+
146+
// TODO: Add more collation-aware regexp expressions.
147+
148+
/**
149+
* Other collation-aware expressions.
150+
*/
151+
152+
// TODO: Add other collation-aware expressions.
153+
154+
/**
155+
* Utility class for collation-aware UTF8String operations.
156+
*/
157+
158+
private static class CollationAwareUTF8String {
159+
160+
private static boolean matchAt(final UTF8String target, final UTF8String pattern,
161+
final int pos, final int collationId) {
162+
if (pattern.numChars() + pos > target.numChars() || pos < 0) {
163+
return false;
164+
}
165+
if (pattern.numBytes() == 0 || target.numBytes() == 0) {
166+
return pattern.numBytes() == 0;
167+
}
168+
return CollationFactory.getStringSearch(target.substring(
169+
pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
170+
}
171+
172+
}
173+
174+
}

Diff for: common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

-54
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
import com.esotericsoftware.kryo.io.Input;
3131
import com.esotericsoftware.kryo.io.Output;
3232

33-
import com.ibm.icu.text.StringSearch;
3433
import org.apache.spark.sql.catalyst.util.CollationFactory;
3534
import org.apache.spark.unsafe.Platform;
3635
import org.apache.spark.unsafe.UTF8StringBuilder;
@@ -342,28 +341,6 @@ public boolean contains(final UTF8String substring) {
342341
return false;
343342
}
344343

345-
public boolean contains(final UTF8String substring, int collationId) {
346-
if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
347-
return this.contains(substring);
348-
}
349-
if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
350-
return this.toLowerCase().contains(substring.toLowerCase());
351-
}
352-
return collatedContains(substring, collationId);
353-
}
354-
355-
private boolean collatedContains(final UTF8String substring, int collationId) {
356-
if (substring.numBytes == 0) return true;
357-
if (this.numBytes == 0) return false;
358-
StringSearch stringSearch = CollationFactory.getStringSearch(this, substring, collationId);
359-
while (stringSearch.next() != StringSearch.DONE) {
360-
if (stringSearch.getMatchLength() == stringSearch.getPattern().length()) {
361-
return true;
362-
}
363-
}
364-
return false;
365-
}
366-
367344
/**
368345
* Returns the byte at position `i`.
369346
*/
@@ -378,45 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
378355
return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
379356
}
380357

381-
private boolean matchAt(final UTF8String s, int pos, int collationId) {
382-
if (s.numChars() + pos > this.numChars() || pos < 0) {
383-
return false;
384-
}
385-
if (s.numBytes == 0 || this.numBytes == 0) {
386-
return s.numBytes == 0;
387-
}
388-
return CollationFactory.getStringSearch(this.substring(pos, pos + s.numChars()),
389-
s, collationId).last() == 0;
390-
}
391-
392358
public boolean startsWith(final UTF8String prefix) {
393359
return matchAt(prefix, 0);
394360
}
395361

396-
public boolean startsWith(final UTF8String prefix, int collationId) {
397-
if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
398-
return this.startsWith(prefix);
399-
}
400-
if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
401-
return this.toLowerCase().startsWith(prefix.toLowerCase());
402-
}
403-
return matchAt(prefix, 0, collationId);
404-
}
405-
406362
public boolean endsWith(final UTF8String suffix) {
407363
return matchAt(suffix, numBytes - suffix.numBytes);
408364
}
409365

410-
public boolean endsWith(final UTF8String suffix, int collationId) {
411-
if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
412-
return this.endsWith(suffix);
413-
}
414-
if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
415-
return this.toLowerCase().endsWith(suffix.toLowerCase());
416-
}
417-
return matchAt(suffix, numBytes - suffix.numBytes, collationId);
418-
}
419-
420366
/**
421367
* Returns the upper case of this string
422368
*/

0 commit comments

Comments
 (0)