Skip to content

Commit c3cec78

Browse files
author
Victor Stinner
committed
Add asciilib: similar to the ucs1, ucs2 and ucs4 libraries, but specialized for ASCII
The ucs1, ucs2 and ucs4 libraries have to scan each created substring to find its maximum character, whereas this is not needed for ASCII strings. Because ASCII strings are common, it is useful to optimize for ASCII.
1 parent 14f8f02 commit c3cec78

File tree

4 files changed

+153
-49
lines changed

4 files changed

+153
-49
lines changed

Include/unicodeobject.h

+1
Original file line numberDiff line numberDiff line change
@@ -1851,6 +1851,7 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buff
18511851
see Objects/stringlib/localeutil.h */
18521852
#ifndef Py_LIMITED_API
18531853
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1854+
PyObject *unicode,
18541855
int kind,
18551856
void *buffer,
18561857
Py_ssize_t n_buffer,

Objects/stringlib/asciilib.h

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/* this is sort of a hack. there's at least one place (formatting
2+
floats) where some stringlib code takes a different path if it's
3+
compiled as unicode. */
4+
#define STRINGLIB_IS_UNICODE 1
5+
6+
#define FASTSEARCH asciilib_fastsearch
7+
#define STRINGLIB(F) asciilib_##F
8+
#define STRINGLIB_OBJECT PyUnicodeObject
9+
#define STRINGLIB_CHAR Py_UCS1
10+
#define STRINGLIB_TYPE_NAME "unicode"
11+
#define STRINGLIB_PARSE_CODE "U"
12+
#define STRINGLIB_EMPTY unicode_empty
13+
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
14+
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
15+
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
16+
#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
17+
#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
18+
#define STRINGLIB_TOLOWER Py_UNICODE_TOLOWER
19+
#define STRINGLIB_FILL Py_UNICODE_FILL
20+
#define STRINGLIB_STR PyUnicode_1BYTE_DATA
21+
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
22+
#define STRINGLIB_NEW unicode_fromascii
23+
#define STRINGLIB_RESIZE not_supported
24+
#define STRINGLIB_CHECK PyUnicode_Check
25+
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
26+
#define STRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping
27+
#define STRINGLIB_GROUPING_LOCALE _PyUnicode_InsertThousandsGroupingLocale
28+
29+
#define STRINGLIB_TOSTR PyObject_Str
30+
#define STRINGLIB_TOASCII PyObject_ASCII
31+
32+
#define _Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping
33+
#define _Py_InsertThousandsGroupingLocale _PyUnicode_ascii_InsertThousandsGroupingLocale
34+

Objects/unicodeobject.c

+116-47
Original file line numberDiff line numberDiff line change
@@ -8331,6 +8331,15 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
83318331

83328332
/* --- Helpers ------------------------------------------------------------ */
83338333

8334+
#include "stringlib/asciilib.h"
8335+
#include "stringlib/fastsearch.h"
8336+
#include "stringlib/partition.h"
8337+
#include "stringlib/split.h"
8338+
#include "stringlib/count.h"
8339+
#include "stringlib/find.h"
8340+
#include "stringlib/localeutil.h"
8341+
#include "stringlib/undef.h"
8342+
83348343
#include "stringlib/ucs1lib.h"
83358344
#include "stringlib/fastsearch.h"
83368345
#include "stringlib/partition.h"
@@ -8359,7 +8368,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
83598368
#include "stringlib/undef.h"
83608369

83618370
static Py_ssize_t
8362-
any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8371+
any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8372+
const Py_UCS1*, Py_ssize_t,
8373+
Py_ssize_t, Py_ssize_t),
8374+
Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
83638375
const Py_UCS1*, Py_ssize_t,
83648376
Py_ssize_t, Py_ssize_t),
83658377
Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
@@ -8396,7 +8408,10 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
83968408

83978409
switch(kind) {
83988410
case PyUnicode_1BYTE_KIND:
8399-
result = ucs1(buf1, len1, buf2, len2, start, end);
8411+
if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8412+
result = ascii(buf1, len1, buf2, len2, start, end);
8413+
else
8414+
result = ucs1(buf1, len1, buf2, len2, start, end);
84008415
break;
84018416
case PyUnicode_2BYTE_KIND:
84028417
result = ucs2(buf1, len1, buf2, len2, start, end);
@@ -8417,7 +8432,7 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
84178432
}
84188433

84198434
Py_ssize_t
8420-
_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8435+
_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
84218436
Py_ssize_t n_buffer,
84228437
void *digits, Py_ssize_t n_digits,
84238438
Py_ssize_t min_width,
@@ -8426,9 +8441,14 @@ _PyUnicode_InsertThousandsGrouping(int kind, void *data,
84268441
{
84278442
switch(kind) {
84288443
case PyUnicode_1BYTE_KIND:
8429-
return _PyUnicode_ucs1_InsertThousandsGrouping(
8430-
(Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8431-
min_width, grouping, thousands_sep);
8444+
if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8445+
return _PyUnicode_ascii_InsertThousandsGrouping(
8446+
(Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8447+
min_width, grouping, thousands_sep);
8448+
else
8449+
return _PyUnicode_ucs1_InsertThousandsGrouping(
8450+
(Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8451+
min_width, grouping, thousands_sep);
84328452
case PyUnicode_2BYTE_KIND:
84338453
return _PyUnicode_ucs2_InsertThousandsGrouping(
84348454
(Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
@@ -8505,10 +8525,16 @@ PyUnicode_Count(PyObject *str,
85058525
ADJUST_INDICES(start, end, len1);
85068526
switch(kind) {
85078527
case PyUnicode_1BYTE_KIND:
8508-
result = ucs1lib_count(
8509-
((Py_UCS1*)buf1) + start, end - start,
8510-
buf2, len2, PY_SSIZE_T_MAX
8511-
);
8528+
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8529+
result = asciilib_count(
8530+
((Py_UCS1*)buf1) + start, end - start,
8531+
buf2, len2, PY_SSIZE_T_MAX
8532+
);
8533+
else
8534+
result = ucs1lib_count(
8535+
((Py_UCS1*)buf1) + start, end - start,
8536+
buf2, len2, PY_SSIZE_T_MAX
8537+
);
85128538
break;
85138539
case PyUnicode_2BYTE_KIND:
85148540
result = ucs2lib_count(
@@ -8565,12 +8591,14 @@ PyUnicode_Find(PyObject *str,
85658591

85668592
if (direction > 0)
85678593
result = any_find_slice(
8568-
ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8594+
asciilib_find_slice, ucs1lib_find_slice,
8595+
ucs2lib_find_slice, ucs4lib_find_slice,
85698596
str, sub, start, end
85708597
);
85718598
else
85728599
result = any_find_slice(
8573-
ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8600+
asciilib_rfind_slice, ucs1lib_rfind_slice, /* reverse search: the ASCII callback must be the rfind variant too */
8601+
ucs2lib_rfind_slice, ucs4lib_rfind_slice,
85748602
str, sub, start, end
85758603
);
85768604

@@ -9200,9 +9228,14 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
92009228

92019229
switch(PyUnicode_KIND(string)) {
92029230
case PyUnicode_1BYTE_KIND:
9203-
list = ucs1lib_splitlines(
9204-
(PyObject*) string, PyUnicode_1BYTE_DATA(string),
9205-
PyUnicode_GET_LENGTH(string), keepends);
9231+
if (PyUnicode_IS_ASCII(string))
9232+
list = asciilib_splitlines(
9233+
(PyObject*) string, PyUnicode_1BYTE_DATA(string),
9234+
PyUnicode_GET_LENGTH(string), keepends);
9235+
else
9236+
list = ucs1lib_splitlines(
9237+
(PyObject*) string, PyUnicode_1BYTE_DATA(string),
9238+
PyUnicode_GET_LENGTH(string), keepends);
92069239
break;
92079240
case PyUnicode_2BYTE_KIND:
92089241
list = ucs2lib_splitlines(
@@ -9241,10 +9274,16 @@ split(PyObject *self,
92419274
if (substring == NULL)
92429275
switch(PyUnicode_KIND(self)) {
92439276
case PyUnicode_1BYTE_KIND:
9244-
return ucs1lib_split_whitespace(
9245-
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9246-
PyUnicode_GET_LENGTH(self), maxcount
9247-
);
9277+
if (PyUnicode_IS_ASCII(self))
9278+
return asciilib_split_whitespace(
9279+
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9280+
PyUnicode_GET_LENGTH(self), maxcount
9281+
);
9282+
else
9283+
return ucs1lib_split_whitespace(
9284+
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9285+
PyUnicode_GET_LENGTH(self), maxcount
9286+
);
92489287
case PyUnicode_2BYTE_KIND:
92499288
return ucs2lib_split_whitespace(
92509289
(PyObject*) self, PyUnicode_2BYTE_DATA(self),
@@ -9283,8 +9322,12 @@ split(PyObject *self,
92839322

92849323
switch(kind) {
92859324
case PyUnicode_1BYTE_KIND:
9286-
out = ucs1lib_split(
9287-
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
9325+
if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9326+
out = asciilib_split(
9327+
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
9328+
else
9329+
out = ucs1lib_split(
9330+
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
92889331
break;
92899332
case PyUnicode_2BYTE_KIND:
92909333
out = ucs2lib_split(
@@ -9323,10 +9366,16 @@ rsplit(PyObject *self,
93239366
if (substring == NULL)
93249367
switch(PyUnicode_KIND(self)) {
93259368
case PyUnicode_1BYTE_KIND:
9326-
return ucs1lib_rsplit_whitespace(
9327-
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9328-
PyUnicode_GET_LENGTH(self), maxcount
9329-
);
9369+
if (PyUnicode_IS_ASCII(self))
9370+
return asciilib_rsplit_whitespace(
9371+
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9372+
PyUnicode_GET_LENGTH(self), maxcount
9373+
);
9374+
else
9375+
return ucs1lib_rsplit_whitespace(
9376+
(PyObject*) self, PyUnicode_1BYTE_DATA(self),
9377+
PyUnicode_GET_LENGTH(self), maxcount
9378+
);
93309379
case PyUnicode_2BYTE_KIND:
93319380
return ucs2lib_rsplit_whitespace(
93329381
(PyObject*) self, PyUnicode_2BYTE_DATA(self),
@@ -9365,8 +9414,12 @@ rsplit(PyObject *self,
93659414

93669415
switch(kind) {
93679416
case PyUnicode_1BYTE_KIND:
9368-
out = ucs1lib_rsplit(
9369-
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
9417+
if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9418+
out = asciilib_rsplit(
9419+
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
9420+
else
9421+
out = ucs1lib_rsplit(
9422+
(PyObject*) self, buf1, len1, buf2, len2, maxcount);
93709423
break;
93719424
case PyUnicode_2BYTE_KIND:
93729425
out = ucs2lib_rsplit(
@@ -9387,12 +9440,15 @@ rsplit(PyObject *self,
93879440
}
93889441

93899442
static Py_ssize_t
9390-
anylib_find(int kind, void *buf1, Py_ssize_t len1,
9391-
void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9443+
anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9444+
PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
93929445
{
93939446
switch(kind) {
93949447
case PyUnicode_1BYTE_KIND:
9395-
return ucs1lib_find(buf1, len1, buf2, len2, offset);
9448+
if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9449+
return asciilib_find(buf1, len1, buf2, len2, offset);
9450+
else
9451+
return ucs1lib_find(buf1, len1, buf2, len2, offset);
93969452
case PyUnicode_2BYTE_KIND:
93979453
return ucs2lib_find(buf1, len1, buf2, len2, offset);
93989454
case PyUnicode_4BYTE_KIND:
@@ -9403,12 +9459,15 @@ anylib_find(int kind, void *buf1, Py_ssize_t len1,
94039459
}
94049460

94059461
static Py_ssize_t
9406-
anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9407-
void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9462+
anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9463+
PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
94089464
{
94099465
switch(kind) {
94109466
case PyUnicode_1BYTE_KIND:
9411-
return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9467+
if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9468+
return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9469+
else
9470+
return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
94129471
case PyUnicode_2BYTE_KIND:
94139472
return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
94149473
case PyUnicode_4BYTE_KIND:
@@ -9497,7 +9556,7 @@ replace(PyObject *self, PyObject *str1,
94979556
if (!buf1) goto error;
94989557
release1 = 1;
94999558
}
9500-
i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9559+
i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
95019560
if (i < 0)
95029561
goto nothing;
95039562
if (rkind > kind2) {
@@ -9530,9 +9589,9 @@ replace(PyObject *self, PyObject *str1,
95309589
i += len1;
95319590

95329591
while ( --maxcount > 0) {
9533-
i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9534-
slen-i,
9535-
buf1, len1, i);
9592+
i = anylib_find(rkind, self,
9593+
sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9594+
str1, buf1, len1, i);
95369595
if (i == -1)
95379596
break;
95389597
memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
@@ -9557,7 +9616,7 @@ replace(PyObject *self, PyObject *str1,
95579616
if (!buf1) goto error;
95589617
release1 = 1;
95599618
}
9560-
n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9619+
n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
95619620
if (n == 0)
95629621
goto nothing;
95639622
if (kind2 < rkind) {
@@ -9596,9 +9655,9 @@ replace(PyObject *self, PyObject *str1,
95969655
if (len1 > 0) {
95979656
while (n-- > 0) {
95989657
/* look for next match */
9599-
j = anylib_find(rkind,
9600-
sbuf + PyUnicode_KIND_SIZE(rkind, i),
9601-
slen-i, buf1, len1, i);
9658+
j = anylib_find(rkind, self,
9659+
sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9660+
str1, buf1, len1, i);
96029661
if (j == -1)
96039662
break;
96049663
else if (j > i) {
@@ -10443,7 +10502,8 @@ unicode_find(PyObject *self, PyObject *args)
1044310502
return NULL;
1044410503

1044510504
result = any_find_slice(
10446-
ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10505+
asciilib_find_slice, ucs1lib_find_slice,
10506+
ucs2lib_find_slice, ucs4lib_find_slice,
1044710507
self, (PyObject*)substring, start, end
1044810508
);
1044910509

@@ -10536,7 +10596,8 @@ unicode_index(PyObject *self, PyObject *args)
1053610596
return NULL;
1053710597

1053810598
result = any_find_slice(
10539-
ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10599+
asciilib_find_slice, ucs1lib_find_slice,
10600+
ucs2lib_find_slice, ucs4lib_find_slice,
1054010601
self, (PyObject*)substring, start, end
1054110602
);
1054210603

@@ -11548,7 +11609,8 @@ unicode_rfind(PyObject *self, PyObject *args)
1154811609
return NULL;
1154911610

1155011611
result = any_find_slice(
11551-
ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11612+
asciilib_rfind_slice, ucs1lib_rfind_slice,
11613+
ucs2lib_rfind_slice, ucs4lib_rfind_slice,
1155211614
self, (PyObject*)substring, start, end
1155311615
);
1155411616

@@ -11583,7 +11645,8 @@ unicode_rindex(PyObject *self, PyObject *args)
1158311645
return NULL;
1158411646

1158511647
result = any_find_slice(
11586-
ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11648+
asciilib_rfind_slice, ucs1lib_rfind_slice,
11649+
ucs2lib_rfind_slice, ucs4lib_rfind_slice,
1158711650
self, (PyObject*)substring, start, end
1158811651
);
1158911652

@@ -11712,7 +11775,10 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
1171211775

1171311776
switch(PyUnicode_KIND(str_obj)) {
1171411777
case PyUnicode_1BYTE_KIND:
11715-
out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11778+
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11779+
out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11780+
else
11781+
out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
1171611782
break;
1171711783
case PyUnicode_2BYTE_KIND:
1171811784
out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
@@ -11781,7 +11847,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
1178111847

1178211848
switch(PyUnicode_KIND(str_in)) {
1178311849
case PyUnicode_1BYTE_KIND:
11784-
out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11850+
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11851+
out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11852+
else
11853+
out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
1178511854
break;
1178611855
case PyUnicode_2BYTE_KIND:
1178711856
out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);

0 commit comments

Comments
 (0)