Skip to content

Commit 569fdb7

Browse files
committed
use unicode character categories to classify identifier characters
fixes #6797, fixes #5936
1 parent 56e04cb commit 569fdb7

File tree

2 files changed

+63
-13
lines changed

2 files changed

+63
-13
lines changed

src/flisp/julia_extensions.c

+61-3
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,67 @@ value_t fl_skipws(value_t *args, u_int32_t nargs)
4040
return skipped;
4141
}
4242

43+
static int is_wc_cat_id_start(uint32_t wc, utf8proc_propval_t cat)
44+
{
45+
return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
46+
cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
47+
cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
48+
// allow currency symbols
49+
cat == UTF8PROC_CATEGORY_SC ||
50+
// allow all latin-1 characters except math symbols and quotes
51+
(wc <= 0xff && cat != UTF8PROC_CATEGORY_SM &&
52+
cat != UTF8PROC_CATEGORY_PF && cat != UTF8PROC_CATEGORY_PI) ||
53+
// Other_ID_Start
54+
wc == 0x2118 || wc == 0x212E || (wc >= 0x309B && wc <= 0x309C));
55+
}
56+
57+
static int jl_id_start_char(uint32_t wc)
58+
{
59+
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_')
60+
return 1;
61+
if (wc < 0xA1 || wc > 0x10ffff)
62+
return 0;
63+
const utf8proc_property_t *prop = utf8proc_get_property(wc);
64+
return is_wc_cat_id_start(wc, prop->category);
65+
}
66+
4367
static int jl_id_char(uint32_t wc)
4468
{
45-
return ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') ||
46-
(wc >= '0' && wc <= '9') || (wc >= 0xA1) ||
47-
wc == '!' || wc == '_');
69+
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' ||
70+
(wc >= '0' && wc <= '9') || wc == '!')
71+
return 1;
72+
if (wc < 0xA1 || wc > 0x10ffff)
73+
return 0;
74+
const utf8proc_property_t *prop = utf8proc_get_property(wc);
75+
utf8proc_propval_t cat = prop->category;
76+
if (is_wc_cat_id_start(wc, cat)) return 1;
77+
if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
78+
cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
79+
cat == UTF8PROC_CATEGORY_SK ||
80+
// primes
81+
(wc >= 0x2032 && wc <= 0x2034) ||
82+
// Other_ID_Continue
83+
wc == 0x0387 || wc == 0x19da || (wc >= 0x1369 && wc <= 0x1371))
84+
return 1;
85+
return 0;
86+
}
87+
88+
value_t fl_julia_identifier_char(value_t *args, u_int32_t nargs)
89+
{
90+
argcount("identifier-char?", nargs, 1);
91+
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
92+
type_error("identifier-char?", "wchar", args[0]);
93+
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
94+
return jl_id_char(wc);
95+
}
96+
97+
value_t fl_julia_identifier_start_char(value_t *args, u_int32_t nargs)
98+
{
99+
argcount("identifier-start-char?", nargs, 1);
100+
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
101+
type_error("identifier-start-char?", "wchar", args[0]);
102+
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
103+
return jl_id_start_char(wc);
48104
}
49105

50106
// return NFC-normalized UTF8-encoded version of s
@@ -105,6 +161,8 @@ value_t fl_accum_julia_symbol(value_t *args, u_int32_t nargs)
105161
static builtinspec_t julia_flisp_func_info[] = {
106162
{ "skip-ws", fl_skipws },
107163
{ "accum-julia-symbol", fl_accum_julia_symbol },
164+
{ "identifier-char?", fl_julia_identifier_char },
165+
{ "identifier-start-char?", fl_julia_identifier_start_char },
108166
{ NULL, NULL }
109167
};
110168

src/julia-parser.scm

+2-10
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,6 @@
117117
(let ((chrs (string->list "()[]{},;\"`@")))
118118
(lambda (c) (memv c chrs))))
119119
(define (newline? c) (eqv? c #\newline))
120-
(define (identifier-char? c) (or (and (char>=? c #\A)
121-
(char<=? c #\Z))
122-
(and (char>=? c #\a)
123-
(char<=? c #\z))
124-
(and (char>=? c #\0)
125-
(char<=? c #\9))
126-
(char>=? c #\uA1)
127-
(eqv? c #\_)))
128120
;; characters that can be in an operator
129121
(define (opchar? c) (and (char? c) (string.find op-chars c)))
130122
;; characters that can follow . in an operator
@@ -418,7 +410,7 @@
418410

419411
((opchar? c) (read-operator port (read-char port)))
420412

421-
((identifier-char? c) (accum-julia-symbol c port))
413+
((identifier-start-char? c) (accum-julia-symbol c port))
422414

423415
(else (error (string "invalid character \"" (read-char port) "\""))))))
424416

@@ -1523,7 +1515,7 @@
15231515
(define (parse-interpolate s)
15241516
(let* ((p (ts:port s))
15251517
(c (peek-char p)))
1526-
(cond ((identifier-char? c)
1518+
(cond ((identifier-start-char? c)
15271519
(parse-atom s))
15281520
((eqv? c #\()
15291521
(read-char p)

0 commit comments

Comments
 (0)