-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimple-tokenizer.lisp
22 lines (20 loc) · 1.01 KB
/
simple-tokenizer.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
(in-package aprnlp)
;; NOTE: inside a Lisp string a single backslash only escapes the next
;; character for the reader, so the regex-level escape of the period has to
;; be written as "\\.".  An unescaped "." would match ANY character followed
;; by a space, which splits the text at every word gap and swallows the last
;; character of each word.
;; The third alternative is a literal newline embedded in the string; the
;; closing part of the string must therefore stay at column 0.
(defvar *sentence-ending-regex*
  (ppcre:create-scanner "(\\. )|(; )|(
)")
  "Compiled CL-PPCRE scanner matching a sentence boundary: a period followed
by a space, a semicolon followed by a space, or a newline.")
(defun simple-tokenize (text)
  "Tokenize the string TEXT into a vector of sentences.

TEXT is first cut into sentence strings with *SENTENCE-ENDING-REGEX*; each
sentence string is then cut into word strings at #\\Space characters (empty
pieces are dropped).  Every word string becomes a structure built by
MAKE-WORD with:
  :id     - zero-based position of the word inside its sentence,
  :form   - the word string interned in the \"POS/WORDS\" package,
  :suffix - the last (up to) three characters, interned the same way,
  :prefix - the first (up to) three characters, interned the same way.

Returns a fill-pointer vector holding one fill-pointer vector of words per
sentence string."
  (let* ((chunks (ppcre:split *sentence-ending-regex* text))
         (result (make-array (length chunks) :element-type 'vector :fill-pointer 0)))
    (dolist (chunk chunks result)
      (let* ((forms (split-sequence #\Space chunk :remove-empty-subseqs t))
             (tokens (make-array (length forms) :element-type 'word :fill-pointer 0)))
        (loop for form in forms
              for index from 0
              ;; Affixes are at most three characters; shorter words use
              ;; the whole word for both prefix and suffix.
              for affix-length = (min 3 (length form))
              do (vector-push
                  (make-word :id index
                             :form (intern form "POS/WORDS")
                             :suffix (intern (subseq form (- (length form) affix-length))
                                             "POS/WORDS")
                             :prefix (intern (subseq form 0 affix-length)
                                             "POS/WORDS"))
                  tokens))
        (vector-push tokens result)))))