@@ -8331,6 +8331,15 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
8331
8331
8332
8332
/* --- Helpers ------------------------------------------------------------ */
8333
8333
8334
+ #include "stringlib/asciilib.h"
8335
+ #include "stringlib/fastsearch.h"
8336
+ #include "stringlib/partition.h"
8337
+ #include "stringlib/split.h"
8338
+ #include "stringlib/count.h"
8339
+ #include "stringlib/find.h"
8340
+ #include "stringlib/localeutil.h"
8341
+ #include "stringlib/undef.h"
8342
+
8334
8343
#include "stringlib/ucs1lib.h"
8335
8344
#include "stringlib/fastsearch.h"
8336
8345
#include "stringlib/partition.h"
@@ -8359,7 +8368,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
8359
8368
#include "stringlib/undef.h"
8360
8369
8361
8370
static Py_ssize_t
8362
- any_find_slice (Py_ssize_t Py_LOCAL_CALLBACK (ucs1 )(const Py_UCS1 * , Py_ssize_t ,
8371
+ any_find_slice (Py_ssize_t Py_LOCAL_CALLBACK (ascii )(const Py_UCS1 * , Py_ssize_t ,
8372
+ const Py_UCS1 * , Py_ssize_t ,
8373
+ Py_ssize_t , Py_ssize_t ),
8374
+ Py_ssize_t Py_LOCAL_CALLBACK (ucs1 )(const Py_UCS1 * , Py_ssize_t ,
8363
8375
const Py_UCS1 * , Py_ssize_t ,
8364
8376
Py_ssize_t , Py_ssize_t ),
8365
8377
Py_ssize_t Py_LOCAL_CALLBACK (ucs2 )(const Py_UCS2 * , Py_ssize_t ,
@@ -8396,7 +8408,10 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8396
8408
8397
8409
switch (kind ) {
8398
8410
case PyUnicode_1BYTE_KIND :
8399
- result = ucs1 (buf1 , len1 , buf2 , len2 , start , end );
8411
+ if (PyUnicode_IS_ASCII (s1 ) && PyUnicode_IS_ASCII (s2 ))
8412
+ result = ascii (buf1 , len1 , buf2 , len2 , start , end );
8413
+ else
8414
+ result = ucs1 (buf1 , len1 , buf2 , len2 , start , end );
8400
8415
break ;
8401
8416
case PyUnicode_2BYTE_KIND :
8402
8417
result = ucs2 (buf1 , len1 , buf2 , len2 , start , end );
@@ -8417,7 +8432,7 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8417
8432
}
8418
8433
8419
8434
Py_ssize_t
8420
- _PyUnicode_InsertThousandsGrouping (int kind , void * data ,
8435
+ _PyUnicode_InsertThousandsGrouping (PyObject * unicode , int kind , void * data ,
8421
8436
Py_ssize_t n_buffer ,
8422
8437
void * digits , Py_ssize_t n_digits ,
8423
8438
Py_ssize_t min_width ,
@@ -8426,9 +8441,14 @@ _PyUnicode_InsertThousandsGrouping(int kind, void *data,
8426
8441
{
8427
8442
switch (kind ) {
8428
8443
case PyUnicode_1BYTE_KIND :
8429
- return _PyUnicode_ucs1_InsertThousandsGrouping (
8430
- (Py_UCS1 * )data , n_buffer , (Py_UCS1 * )digits , n_digits ,
8431
- min_width , grouping , thousands_sep );
8444
+ if (unicode != NULL && PyUnicode_IS_ASCII (unicode ))
8445
+ return _PyUnicode_ascii_InsertThousandsGrouping (
8446
+ (Py_UCS1 * )data , n_buffer , (Py_UCS1 * )digits , n_digits ,
8447
+ min_width , grouping , thousands_sep );
8448
+ else
8449
+ return _PyUnicode_ucs1_InsertThousandsGrouping (
8450
+ (Py_UCS1 * )data , n_buffer , (Py_UCS1 * )digits , n_digits ,
8451
+ min_width , grouping , thousands_sep );
8432
8452
case PyUnicode_2BYTE_KIND :
8433
8453
return _PyUnicode_ucs2_InsertThousandsGrouping (
8434
8454
(Py_UCS2 * )data , n_buffer , (Py_UCS2 * )digits , n_digits ,
@@ -8505,10 +8525,16 @@ PyUnicode_Count(PyObject *str,
8505
8525
ADJUST_INDICES (start , end , len1 );
8506
8526
switch (kind ) {
8507
8527
case PyUnicode_1BYTE_KIND :
8508
- result = ucs1lib_count (
8509
- ((Py_UCS1 * )buf1 ) + start , end - start ,
8510
- buf2 , len2 , PY_SSIZE_T_MAX
8511
- );
8528
+ if (PyUnicode_IS_ASCII (str_obj ) && PyUnicode_IS_ASCII (sub_obj ))
8529
+ result = asciilib_count (
8530
+ ((Py_UCS1 * )buf1 ) + start , end - start ,
8531
+ buf2 , len2 , PY_SSIZE_T_MAX
8532
+ );
8533
+ else
8534
+ result = ucs1lib_count (
8535
+ ((Py_UCS1 * )buf1 ) + start , end - start ,
8536
+ buf2 , len2 , PY_SSIZE_T_MAX
8537
+ );
8512
8538
break ;
8513
8539
case PyUnicode_2BYTE_KIND :
8514
8540
result = ucs2lib_count (
@@ -8565,12 +8591,14 @@ PyUnicode_Find(PyObject *str,
8565
8591
8566
8592
if (direction > 0 )
8567
8593
result = any_find_slice (
8568
- ucs1lib_find_slice , ucs2lib_find_slice , ucs4lib_find_slice ,
8594
+ asciilib_find_slice , ucs1lib_find_slice ,
8595
+ ucs2lib_find_slice , ucs4lib_find_slice ,
8569
8596
str , sub , start , end
8570
8597
);
8571
8598
else
8572
8599
result = any_find_slice (
8573
- ucs1lib_rfind_slice , ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
8600
+ asciilib_rfind_slice , ucs1lib_rfind_slice ,
8601
+ ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
8574
8602
str , sub , start , end
8575
8603
);
8576
8604
@@ -9200,9 +9228,14 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
9200
9228
9201
9229
switch (PyUnicode_KIND (string )) {
9202
9230
case PyUnicode_1BYTE_KIND :
9203
- list = ucs1lib_splitlines (
9204
- (PyObject * ) string , PyUnicode_1BYTE_DATA (string ),
9205
- PyUnicode_GET_LENGTH (string ), keepends );
9231
+ if (PyUnicode_IS_ASCII (string ))
9232
+ list = asciilib_splitlines (
9233
+ (PyObject * ) string , PyUnicode_1BYTE_DATA (string ),
9234
+ PyUnicode_GET_LENGTH (string ), keepends );
9235
+ else
9236
+ list = ucs1lib_splitlines (
9237
+ (PyObject * ) string , PyUnicode_1BYTE_DATA (string ),
9238
+ PyUnicode_GET_LENGTH (string ), keepends );
9206
9239
break ;
9207
9240
case PyUnicode_2BYTE_KIND :
9208
9241
list = ucs2lib_splitlines (
@@ -9241,10 +9274,16 @@ split(PyObject *self,
9241
9274
if (substring == NULL )
9242
9275
switch (PyUnicode_KIND (self )) {
9243
9276
case PyUnicode_1BYTE_KIND :
9244
- return ucs1lib_split_whitespace (
9245
- (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9246
- PyUnicode_GET_LENGTH (self ), maxcount
9247
- );
9277
+ if (PyUnicode_IS_ASCII (self ))
9278
+ return asciilib_split_whitespace (
9279
+ (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9280
+ PyUnicode_GET_LENGTH (self ), maxcount
9281
+ );
9282
+ else
9283
+ return ucs1lib_split_whitespace (
9284
+ (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9285
+ PyUnicode_GET_LENGTH (self ), maxcount
9286
+ );
9248
9287
case PyUnicode_2BYTE_KIND :
9249
9288
return ucs2lib_split_whitespace (
9250
9289
(PyObject * ) self , PyUnicode_2BYTE_DATA (self ),
@@ -9283,8 +9322,12 @@ split(PyObject *self,
9283
9322
9284
9323
switch (kind ) {
9285
9324
case PyUnicode_1BYTE_KIND :
9286
- out = ucs1lib_split (
9287
- (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9325
+ if (PyUnicode_IS_ASCII (self ) && PyUnicode_IS_ASCII (substring ))
9326
+ out = asciilib_split (
9327
+ (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9328
+ else
9329
+ out = ucs1lib_split (
9330
+ (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9288
9331
break ;
9289
9332
case PyUnicode_2BYTE_KIND :
9290
9333
out = ucs2lib_split (
@@ -9323,10 +9366,16 @@ rsplit(PyObject *self,
9323
9366
if (substring == NULL )
9324
9367
switch (PyUnicode_KIND (self )) {
9325
9368
case PyUnicode_1BYTE_KIND :
9326
- return ucs1lib_rsplit_whitespace (
9327
- (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9328
- PyUnicode_GET_LENGTH (self ), maxcount
9329
- );
9369
+ if (PyUnicode_IS_ASCII (self ))
9370
+ return asciilib_rsplit_whitespace (
9371
+ (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9372
+ PyUnicode_GET_LENGTH (self ), maxcount
9373
+ );
9374
+ else
9375
+ return ucs1lib_rsplit_whitespace (
9376
+ (PyObject * ) self , PyUnicode_1BYTE_DATA (self ),
9377
+ PyUnicode_GET_LENGTH (self ), maxcount
9378
+ );
9330
9379
case PyUnicode_2BYTE_KIND :
9331
9380
return ucs2lib_rsplit_whitespace (
9332
9381
(PyObject * ) self , PyUnicode_2BYTE_DATA (self ),
@@ -9365,8 +9414,12 @@ rsplit(PyObject *self,
9365
9414
9366
9415
switch (kind ) {
9367
9416
case PyUnicode_1BYTE_KIND :
9368
- out = ucs1lib_rsplit (
9369
- (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9417
+ if (PyUnicode_IS_ASCII (self ) && PyUnicode_IS_ASCII (substring ))
9418
+ out = asciilib_rsplit (
9419
+ (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9420
+ else
9421
+ out = ucs1lib_rsplit (
9422
+ (PyObject * ) self , buf1 , len1 , buf2 , len2 , maxcount );
9370
9423
break ;
9371
9424
case PyUnicode_2BYTE_KIND :
9372
9425
out = ucs2lib_rsplit (
@@ -9387,12 +9440,15 @@ rsplit(PyObject *self,
9387
9440
}
9388
9441
9389
9442
static Py_ssize_t
9390
- anylib_find (int kind , void * buf1 , Py_ssize_t len1 ,
9391
- void * buf2 , Py_ssize_t len2 , Py_ssize_t offset )
9443
+ anylib_find (int kind , PyObject * str1 , void * buf1 , Py_ssize_t len1 ,
9444
+ PyObject * str2 , void * buf2 , Py_ssize_t len2 , Py_ssize_t offset )
9392
9445
{
9393
9446
switch (kind ) {
9394
9447
case PyUnicode_1BYTE_KIND :
9395
- return ucs1lib_find (buf1 , len1 , buf2 , len2 , offset );
9448
+ if (PyUnicode_IS_ASCII (str1 ) && PyUnicode_IS_ASCII (str2 ))
9449
+ return asciilib_find (buf1 , len1 , buf2 , len2 , offset );
9450
+ else
9451
+ return ucs1lib_find (buf1 , len1 , buf2 , len2 , offset );
9396
9452
case PyUnicode_2BYTE_KIND :
9397
9453
return ucs2lib_find (buf1 , len1 , buf2 , len2 , offset );
9398
9454
case PyUnicode_4BYTE_KIND :
@@ -9403,12 +9459,15 @@ anylib_find(int kind, void *buf1, Py_ssize_t len1,
9403
9459
}
9404
9460
9405
9461
static Py_ssize_t
9406
- anylib_count (int kind , void * sbuf , Py_ssize_t slen ,
9407
- void * buf1 , Py_ssize_t len1 , Py_ssize_t maxcount )
9462
+ anylib_count (int kind , PyObject * sstr , void * sbuf , Py_ssize_t slen ,
9463
+ PyObject * str1 , void * buf1 , Py_ssize_t len1 , Py_ssize_t maxcount )
9408
9464
{
9409
9465
switch (kind ) {
9410
9466
case PyUnicode_1BYTE_KIND :
9411
- return ucs1lib_count (sbuf , slen , buf1 , len1 , maxcount );
9467
+ if (PyUnicode_IS_ASCII (sstr ) && PyUnicode_IS_ASCII (str1 ))
9468
+ return asciilib_count (sbuf , slen , buf1 , len1 , maxcount );
9469
+ else
9470
+ return ucs1lib_count (sbuf , slen , buf1 , len1 , maxcount );
9412
9471
case PyUnicode_2BYTE_KIND :
9413
9472
return ucs2lib_count (sbuf , slen , buf1 , len1 , maxcount );
9414
9473
case PyUnicode_4BYTE_KIND :
@@ -9497,7 +9556,7 @@ replace(PyObject *self, PyObject *str1,
9497
9556
if (!buf1 ) goto error ;
9498
9557
release1 = 1 ;
9499
9558
}
9500
- i = anylib_find (rkind , sbuf , slen , buf1 , len1 , 0 );
9559
+ i = anylib_find (rkind , self , sbuf , slen , str1 , buf1 , len1 , 0 );
9501
9560
if (i < 0 )
9502
9561
goto nothing ;
9503
9562
if (rkind > kind2 ) {
@@ -9530,9 +9589,9 @@ replace(PyObject *self, PyObject *str1,
9530
9589
i += len1 ;
9531
9590
9532
9591
while ( -- maxcount > 0 ) {
9533
- i = anylib_find (rkind , sbuf + PyUnicode_KIND_SIZE ( rkind , i ) ,
9534
- slen - i ,
9535
- buf1 , len1 , i );
9592
+ i = anylib_find (rkind , self ,
9593
+ sbuf + PyUnicode_KIND_SIZE ( rkind , i ), slen - i ,
9594
+ str1 , buf1 , len1 , i );
9536
9595
if (i == -1 )
9537
9596
break ;
9538
9597
memcpy (res + PyUnicode_KIND_SIZE (rkind , i ),
@@ -9557,7 +9616,7 @@ replace(PyObject *self, PyObject *str1,
9557
9616
if (!buf1 ) goto error ;
9558
9617
release1 = 1 ;
9559
9618
}
9560
- n = anylib_count (rkind , sbuf , slen , buf1 , len1 , maxcount );
9619
+ n = anylib_count (rkind , self , sbuf , slen , str1 , buf1 , len1 , maxcount );
9561
9620
if (n == 0 )
9562
9621
goto nothing ;
9563
9622
if (kind2 < rkind ) {
@@ -9596,9 +9655,9 @@ replace(PyObject *self, PyObject *str1,
9596
9655
if (len1 > 0 ) {
9597
9656
while (n -- > 0 ) {
9598
9657
/* look for next match */
9599
- j = anylib_find (rkind ,
9600
- sbuf + PyUnicode_KIND_SIZE (rkind , i ),
9601
- slen - i , buf1 , len1 , i );
9658
+ j = anylib_find (rkind , self ,
9659
+ sbuf + PyUnicode_KIND_SIZE (rkind , i ), slen - i ,
9660
+ str1 , buf1 , len1 , i );
9602
9661
if (j == -1 )
9603
9662
break ;
9604
9663
else if (j > i ) {
@@ -10443,7 +10502,8 @@ unicode_find(PyObject *self, PyObject *args)
10443
10502
return NULL ;
10444
10503
10445
10504
result = any_find_slice (
10446
- ucs1lib_find_slice , ucs2lib_find_slice , ucs4lib_find_slice ,
10505
+ asciilib_find_slice , ucs1lib_find_slice ,
10506
+ ucs2lib_find_slice , ucs4lib_find_slice ,
10447
10507
self , (PyObject * )substring , start , end
10448
10508
);
10449
10509
@@ -10536,7 +10596,8 @@ unicode_index(PyObject *self, PyObject *args)
10536
10596
return NULL ;
10537
10597
10538
10598
result = any_find_slice (
10539
- ucs1lib_find_slice , ucs2lib_find_slice , ucs4lib_find_slice ,
10599
+ asciilib_find_slice , ucs1lib_find_slice ,
10600
+ ucs2lib_find_slice , ucs4lib_find_slice ,
10540
10601
self , (PyObject * )substring , start , end
10541
10602
);
10542
10603
@@ -11548,7 +11609,8 @@ unicode_rfind(PyObject *self, PyObject *args)
11548
11609
return NULL ;
11549
11610
11550
11611
result = any_find_slice (
11551
- ucs1lib_rfind_slice , ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
11612
+ asciilib_rfind_slice , ucs1lib_rfind_slice ,
11613
+ ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
11552
11614
self , (PyObject * )substring , start , end
11553
11615
);
11554
11616
@@ -11583,7 +11645,8 @@ unicode_rindex(PyObject *self, PyObject *args)
11583
11645
return NULL ;
11584
11646
11585
11647
result = any_find_slice (
11586
- ucs1lib_rfind_slice , ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
11648
+ asciilib_rfind_slice , ucs1lib_rfind_slice ,
11649
+ ucs2lib_rfind_slice , ucs4lib_rfind_slice ,
11587
11650
self , (PyObject * )substring , start , end
11588
11651
);
11589
11652
@@ -11712,7 +11775,10 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11712
11775
11713
11776
switch (PyUnicode_KIND (str_obj )) {
11714
11777
case PyUnicode_1BYTE_KIND :
11715
- out = ucs1lib_partition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11778
+ if (PyUnicode_IS_ASCII (str_obj ) && PyUnicode_IS_ASCII (sep_obj ))
11779
+ out = asciilib_partition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11780
+ else
11781
+ out = ucs1lib_partition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11716
11782
break ;
11717
11783
case PyUnicode_2BYTE_KIND :
11718
11784
out = ucs2lib_partition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
@@ -11781,7 +11847,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11781
11847
11782
11848
switch (PyUnicode_KIND (str_in )) {
11783
11849
case PyUnicode_1BYTE_KIND :
11784
- out = ucs1lib_rpartition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11850
+ if (PyUnicode_IS_ASCII (str_obj ) && PyUnicode_IS_ASCII (sep_obj ))
11851
+ out = asciilib_rpartition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11852
+ else
11853
+ out = ucs1lib_rpartition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
11785
11854
break ;
11786
11855
case PyUnicode_2BYTE_KIND :
11787
11856
out = ucs2lib_rpartition (str_obj , buf1 , len1 , sep_obj , buf2 , len2 );
0 commit comments