-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlemma.lisp
97 lines (87 loc) · 4.48 KB
/
lemma.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
;; -*- mode: Lisp; coding: utf-8-unix; -*-
;; Copyright (c) 2024, April & May
;; SPDX-License-Identifier: 0BSD
(in-package aprnlp)
(defclass lemmatizer ()
((name :initform "unnamed-lemmatizer"
:accessor lemmatizer-name)
(dict :initform (make-hash-table :test #'eq)
:accessor lemmatizer-dict)))
(defparameter *loaded-lemmatizer* nil)
(defmethod load-processor ((class (eql 'lemmatizer)) file)
#+lispworks (hcl:load-data-file file)
#-lispworks (load file)
*loaded-lemmatizer*)
(defmethod save-processor ((lemmatizer lemmatizer) directory)
(with-slots (name dict) lemmatizer
(let ((filename (make-pathname :name name :type "fasl" :defaults directory)))
#+lispworks
(hcl:with-output-to-fasl-file (out filename :overwrite t)
(hcl:dump-form '(setf *loaded-lemmatizer* (make-lemmatizer)) out)
(hcl:dump-form `(setf (lemmatizer-name *loaded-lemmatizer*) ,name) out)
(hcl:dump-form `(setf (lemmatizer-dict *loaded-lemmatizer*) (plist-to-table ',(table-to-plist dict)))
out))
#-lispworks
(let ((src (make-pathname :name name :type "lisp" :defaults directory)))
(with-open-file (out src
:direction :output
:if-exists :supersede
:if-does-not-exist :create)
(prin1 '(setf *loaded-lemmatizer* (make-lemmatizer)) out)
(prin1 `(setf (lemmatizer-name *loaded-lemmatizer*) ,name) out)
(prin1 `(setf (lemmatizer-dict *loaded-lemmatizer*) (plist-to-table ',(table-to-plist dict)))
out))
(compile-file src :output-file filename)
(delete-file src))
(log-info "Dependency Parser saved, size: ~A" (print-size (file-size-in-octets filename))))))
(defun lemmatize (lemmatizer word)
(let ((form (word-form word)))
(setf (word-lemma word)
(gethash form (lemmatizer-dict lemmatizer) form))))
(defmethod process ((lemmatizer lemmatizer) sentence)
(iter (for word :in-vector sentence)
(lemmatize lemmatizer word)))
(defmethod train ((lemmatizer lemmatizer) sentences &key cycles save-dir)
(declare (ignore cycles))
(unless save-dir
(setq save-dir (asdf/system:system-source-directory :aprnlp)))
(log-info "Start training with ~D sentences. ~A"
(length sentences)
#+lispworks (lw:string-append "Heap size: " (print-size (getf (sys:room-values) :total-size)))
#-lispworks "")
(let ((start-time (get-internal-real-time)))
(iter (for sentence :in-vector sentences)
(iter (for word :in-vector sentence)
(setf (gethash (word-form word) (lemmatizer-dict lemmatizer))
(word-lemma word))))
(save-processor lemmatizer save-dir)
(log-info "Training completed using ~,2Fs. ~A"
(/ (- (get-internal-real-time) start-time) 1000)
#+lispworks (lw:string-append "Heap size: " (print-size (getf (sys:room-values) :total-size)))
#-lispworks "")))
(defmethod test ((lemmatizer lemmatizer) sentences)
(let ((correct-count 0)
(total-count 0)
(start-time (get-internal-real-time)))
(iter (for sentence :in-vector sentences)
(for new-sentence :next (coerce (iter (for word :in-vector sentence)
(collect (copy-word word)))
'vector))
(process lemmatizer new-sentence)
(iter (for truth :in-vector sentence)
(for guess :in-vector new-sentence)
(when (= (word-head guess) (word-head truth))
(incf correct-count))
(incf total-count)))
(log-info "Test ~D sentences using ~,2Fs, result: ~D/~D (~,2F%)"
(length sentences)
#+lispworks (/ (- (get-internal-real-time) start-time) 1000)
#-lispworks (/ (- (get-internal-real-time) start-time) 1000000)
correct-count total-count (* 100 (/ correct-count total-count)))
(float (* 100 (/ correct-count total-count)))))
(defmethod test-training ((class (eql 'lemmatizer)))
(let ((lemmatizer (make-instance 'lemmatizer))
(ud-dir (merge-pathnames "ud-treebanks-v2.14/" (asdf:system-source-directory :aprnlp))))
(train lemmatizer (read-conllu-files (merge-pathnames "UD_English-GUM/en_gum-ud-train.conllu" ud-dir)))
(test lemmatizer (read-conllu-files (merge-pathnames "UD_English-GUM/en_gum-ud-test.conllu" ud-dir)))
(setq *loaded-lemmatizer* lemmatizer)))