-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathStringDistances.jl
181 lines (163 loc) · 5.8 KB
/
StringDistances.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
__precompile__(true)
module StringDistances
##############################################################################
##
## Export
##
##############################################################################
import Base: eltype, length, start, done, next, ==, hash, isless, convert, show, endof
import Base.UTF8proc: isgraphemebreak
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import Iterators: chain
export
evaluate,
compare,
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
QGram,
Cosine,
Jaccard,
SorensenDice,
Overlap,
longest_common_substring,
matching_blocks,
RatcliffObershelp,
Winkler,
Partial,
TokenSort,
TokenSet,
TokenMax,
graphemeiterator
##############################################################################
##
## Define GraphemeIterator as AbstractString
##
## Argument for AbstractString inheritance:
## (i) prevind, nextind, chr2ind, are defined once start, next, done, isvalid, endof are defined
## (ii) SubString(x::GraphemeIterator, i, j) works
## (ii) I can define functions with AbstractString signature in this package (but I could also just define a union type)
## Argument for non inheritance:
## (i) AbstractString gives char as individual. Important for print_escaped & search.
## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join?
##
##############################################################################
# from Base. I redefine it because I want AbstractStringinheritance
immutable GraphemeIterator{S<:AbstractString} <: AbstractString
s::S # original string (for generation of SubStrings)
end
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S}
function length(g::GraphemeIterator)
c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
n = 0
for c in g.s
n += isgraphemebreak(c0, c)
c0 = c
end
return n
end
start(g::GraphemeIterator) = start(g.s)
done(g::GraphemeIterator, i::Int) = done(g.s, i)
function next(g::GraphemeIterator, i::Int)
s = g.s
j = i
c0, k = next(s, i)
while !done(s, k) # loop until next grapheme is s[i:j]
c, ℓ = next(s, k)
isgraphemebreak(c0, c) && break
j = k
k = ℓ
c0 = c
end
return (SubString(s, i, j), k)
end
==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
# added
#these 2 functions allow to define prevind nextind, chr2ind, prevind etc
function Base.isvalid(s::GraphemeIterator, i::Integer)
if !isvalid(s.s, i)
return false
else
i0 = prevind(s.s, i)
return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i])
end
end
function endof(s::GraphemeIterator)
c0 = Char(0x00ad)
i = endof(s.s)
i0 = start(s.s)
while i >= i0 && !isgraphemebreak(s.s[i], c0)
i = prevind(s.s, i)
c0 = s.s[i]
end
i
end
# 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. I need to pass the original string for now
Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s)
# 2. issue with keeping iterator property for stuff like split, join. for now, I decide to loose the enumerator property but add it back after join. But SubString for instance does not loose the property
Base.split(x::GraphemeIterator, args...) = split(x.s, args...)
iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x)
iterator{T <: AbstractString}(::Type{T}, x::AbstractString) = x
##############################################################################
##
## include
##
##############################################################################
include("distances/edit.jl")
include("distances/qgram.jl")
include("distances/RatcliffObershelp.jl")
include("modifiers/winkler.jl")
include("modifiers/fuzzywuzzy.jl")
##############################################################################
##
## Higher level functions
##
##############################################################################
for x in (:evaluate, :compare)
@eval begin
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
len1, len2 = length(s1), length(s2)
if len1 > len2
return $x(dist, s2, s1, len2, len1)
else
return $x(dist, s1, s2, len1, len2)
end
end
end
end
##############################################################################
##
## compare
##
##############################################################################
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
1.0 - evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
distance = evaluate(dist, s1, s2, len1, len2)
len2 == 0 ? 1.0 : 1.0 - distance / len2
end
# compare always return a value between 0 and 1.
# When string length < q for qgram distance, returns s1 == s2
function compare(dist::AbstractQGram,
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::QGram,
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
distance = evaluate(dist, s1, s2, len1, len2)
1 - distance / (len1 + len2 - 2 * dist.q + 2)
end
end