Skip to content

Commit ce0eb71

Browse files
Fix encoding support for ruby-1.9.x
1 parent 5376273 commit ce0eb71

File tree

5 files changed

+103
-89
lines changed

5 files changed

+103
-89
lines changed

README.markdown

+6-4
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,18 @@
88

99
### Manual
1010

11-
git clone https://github.com/ssoper/summarize.git
11+
git clone https://github.com/svenyurgensson/summarize.git
1212
cd summarize
1313
rake build
1414
gem build summarize.gemspec
15-
gem install summarize-1.0.3.gem
15+
gem install summarize-1.0.4.gem
1616

1717
## Usage
1818

1919
A summarize method is added to File, which you can use to summarize the contents of any plain text file:
2020

2121
File.open('path/to/file').summarize
22-
22+
2323
Or use the String method
2424

2525
"text to summarize".summarize
@@ -43,13 +43,15 @@ Topics can also be returned
4343
## Dependencies
4444

4545
You must have glib-2.0 and libxml-2.0 installed and properly configured.
46-
46+
4747
## Author
4848

4949
Gem written by Sean Soper ([@ssoper](http://twitter.com/ssoper))
5050

5151
The Open Text Summarizer library was written by Nadav Rotem and can be found at <http://libots.sourceforge.net/>
5252

53+
UTF-8 encoding handling in the gem was fixed by Yury Batenko ([@svenyurgensson](http://twitter.com/svenyurgensson))
54+
5355
## License
5456

5557
Copyright (C) 2010 Sean Soper <[email protected]>

ext/summarize/dictionary.c

+81-81
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* dictionary.c
33
*
44
* Copyright (C) 2003 Nadav Rotem <[email protected]>
5-
*
5+
*
66
* This program is free software; you can redistribute it and/or modify
77
* it under the terms of the GNU General Public License as published by
88
* the Free Software Foundation; either version 2 of the License, or
@@ -40,28 +40,28 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
4040
xmlNodePtr stem=NULL;
4141
xmlNodePtr pre=NULL;
4242
xmlNodePtr post=NULL;
43-
xmlNodePtr syno=NULL; /* synonyms */
44-
xmlNodePtr manual=NULL; /* manual */
45-
xmlNodePtr step1_pre=NULL; /* step1 */
46-
xmlNodePtr step1_post=NULL; /* step1 */
47-
48-
xmlNodePtr parse=NULL; /* parser rules */
49-
xmlNodePtr pbreak=NULL;
50-
xmlNodePtr pdbreak=NULL;
51-
52-
xmlNodePtr tc_words=NULL; /* term count dictionary */
53-
xmlNodePtr tf_words=NULL; /* term frequency dictionary */
54-
43+
xmlNodePtr syno=NULL; /* synonyms */
44+
xmlNodePtr manual=NULL; /* manual */
45+
xmlNodePtr step1_pre=NULL; /* step1 */
46+
xmlNodePtr step1_post=NULL; /* step1 */
47+
48+
xmlNodePtr parse=NULL; /* parser rules */
49+
xmlNodePtr pbreak=NULL;
50+
xmlNodePtr pdbreak=NULL;
51+
52+
xmlNodePtr tc_words=NULL; /* term count dictionary */
53+
xmlNodePtr tf_words=NULL; /* term frequency dictionary */
54+
5555

5656
OtsStemRule * rule=Doc->stem;
57-
57+
5858
char *local_dict_name;
59-
59+
6060
local_dict_name = g_strdup_printf ("%s.xml", name);
6161

6262

63-
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
64-
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
63+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
64+
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
6565
if (doc == NULL) return (FALSE);
6666

6767
head = xmlDocGetRootElement (doc);
@@ -82,23 +82,23 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
8282
if (head != NULL)
8383
stem = head->xmlChildrenNode;
8484
while ((stem != NULL)
85-
&& (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
85+
&& (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
8686
{
8787
stem = stem->next;
8888
}
8989

9090
if (head != NULL)
9191
parse = head->xmlChildrenNode;
9292
while ((parse != NULL)
93-
&& (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
93+
&& (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
9494
{
9595
parse = parse->next;
9696
}
9797

9898
if (head != NULL)
9999
tc_words = head->xmlChildrenNode;
100100
while ((tc_words != NULL)
101-
&& (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
101+
&& (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
102102
{
103103
tc_words = tc_words->next;
104104
}
@@ -107,12 +107,12 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
107107
if (head != NULL)
108108
tf_words = head->xmlChildrenNode;
109109
while ((tf_words != NULL)
110-
&& (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
110+
&& (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
111111
{
112112
tf_words = tf_words->next;
113113
}
114-
115-
114+
115+
116116

117117
if (stem != NULL)
118118
pre = stem->xmlChildrenNode;
@@ -132,15 +132,15 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
132132
if (stem != NULL)
133133
syno = stem->xmlChildrenNode;
134134
while ((syno != NULL)
135-
&& (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
135+
&& (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
136136
{
137137
syno = syno->next;
138138
}
139139

140140
if (stem != NULL)
141141
manual = stem->xmlChildrenNode;
142142
while ((manual != NULL)
143-
&& (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
143+
&& (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
144144
{
145145
manual = manual->next;
146146
}
@@ -149,31 +149,31 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
149149
if (stem != NULL)
150150
step1_pre = stem->xmlChildrenNode;
151151
while ((step1_pre != NULL)
152-
&& (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
152+
&& (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
153153
{
154154
step1_pre = step1_pre->next;
155155
}
156-
157-
158-
156+
157+
158+
159159
if (stem != NULL)
160160
step1_post = stem->xmlChildrenNode;
161161
while ((step1_post != NULL)
162-
&& (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
162+
&& (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
163163
{
164164
step1_post = step1_post->next;
165165
}
166166

167167

168168
if (pre != NULL)
169-
pre = pre->xmlChildrenNode; /*point to first word */
169+
pre = pre->xmlChildrenNode; /*point to first word */
170170
while (pre != NULL)
171171
{
172172
if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
173-
rule->RemovePre =
174-
g_list_append (rule->RemovePre,
175-
(xmlNodeListGetString
176-
(doc, pre->xmlChildrenNode, 1)));
173+
rule->RemovePre =
174+
g_list_append (rule->RemovePre,
175+
(xmlNodeListGetString
176+
(doc, pre->xmlChildrenNode, 1)));
177177
pre = pre->next;
178178
}
179179

@@ -183,10 +183,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
183183
while (post != NULL)
184184
{
185185
if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
186-
rule->RemovePost =
187-
g_list_append (rule->RemovePost,
188-
(xmlNodeListGetString
189-
(doc, post->xmlChildrenNode, 1)));
186+
rule->RemovePost =
187+
g_list_append (rule->RemovePost,
188+
(xmlNodeListGetString
189+
(doc, post->xmlChildrenNode, 1)));
190190
post = post->next;
191191
}
192192

@@ -195,10 +195,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
195195
while (syno != NULL)
196196
{
197197
if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
198-
rule->synonyms =
199-
g_list_append (rule->synonyms,
200-
(xmlNodeListGetString
201-
(doc, syno->xmlChildrenNode, 1)));
198+
rule->synonyms =
199+
g_list_append (rule->synonyms,
200+
(xmlNodeListGetString
201+
(doc, syno->xmlChildrenNode, 1)));
202202
syno = syno->next;
203203
}
204204

@@ -207,10 +207,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
207207
while (manual != NULL)
208208
{
209209
if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
210-
rule->manual =
211-
g_list_append (rule->manual,
212-
(xmlNodeListGetString
213-
(doc, manual->xmlChildrenNode, 1)));
210+
rule->manual =
211+
g_list_append (rule->manual,
212+
(xmlNodeListGetString
213+
(doc, manual->xmlChildrenNode, 1)));
214214
manual = manual->next;
215215
}
216216

@@ -222,10 +222,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
222222
while (step1_pre != NULL)
223223
{
224224
if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
225-
rule->step1_pre =
226-
g_list_append (rule->step1_pre,
227-
(xmlNodeListGetString
228-
(doc, step1_pre->xmlChildrenNode, 1)));
225+
rule->step1_pre =
226+
g_list_append (rule->step1_pre,
227+
(xmlNodeListGetString
228+
(doc, step1_pre->xmlChildrenNode, 1)));
229229
step1_pre = step1_pre->next;
230230
}
231231

@@ -236,10 +236,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
236236
while (step1_post != NULL)
237237
{
238238
if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
239-
rule->step1_post =
240-
g_list_append (rule->step1_post,
241-
(xmlNodeListGetString
242-
(doc, step1_post->xmlChildrenNode, 1)));
239+
rule->step1_post =
240+
g_list_append (rule->step1_post,
241+
(xmlNodeListGetString
242+
(doc, step1_post->xmlChildrenNode, 1)));
243243
step1_post = step1_post->next;
244244
}
245245

@@ -258,18 +258,18 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
258258
{
259259
pdbreak = pdbreak->next;
260260
}
261-
262-
261+
262+
263263
/*Parser break*/
264264
if (pbreak != NULL)
265265
pbreak = pbreak->xmlChildrenNode;
266266
while (pbreak != NULL)
267267
{
268268
if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
269-
rule->ParserBreak =
270-
g_list_append (rule->ParserBreak,
271-
(xmlNodeListGetString
272-
(doc, pbreak->xmlChildrenNode, 1)));
269+
rule->ParserBreak =
270+
g_list_append (rule->ParserBreak,
271+
(xmlNodeListGetString
272+
(doc, pbreak->xmlChildrenNode, 1)));
273273
pbreak = pbreak->next;
274274
}
275275

@@ -279,10 +279,10 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
279279
while (pdbreak != NULL)
280280
{
281281
if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
282-
rule->ParserDontBreak =
283-
g_list_append (rule->ParserDontBreak,
284-
(xmlNodeListGetString
285-
(doc, pdbreak->xmlChildrenNode, 1)));
282+
rule->ParserDontBreak =
283+
g_list_append (rule->ParserDontBreak,
284+
(xmlNodeListGetString
285+
(doc, pdbreak->xmlChildrenNode, 1)));
286286
pdbreak = pdbreak->next;
287287
}
288288

@@ -293,37 +293,37 @@ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
293293
while (tc_words != NULL)
294294
{
295295
if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
296-
{
297-
xmlChar *key;
298-
key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
299-
Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
296+
{
297+
xmlChar *key;
298+
key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
299+
Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
300300
xmlFree(key);
301301
}
302302
tc_words = tc_words->next;
303303
}
304-
305-
304+
305+
306306
/*Term Frequency load dict*/
307-
307+
308308
if (tf_words != NULL)
309309
tf_words = tf_words->xmlChildrenNode;
310310
while (tf_words != NULL)
311311
{
312312
if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
313-
{
314-
xmlChar *key;
315-
xmlChar *idf_key;
316-
key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
317-
318-
idf_key=xmlGetProp(tf_words,"idf");
319-
Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
313+
{
314+
xmlChar *key;
315+
xmlChar *idf_key;
316+
key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
317+
318+
idf_key=xmlGetProp(tf_words,"idf");
319+
Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
320320
xmlFree(key);
321321
xmlFree(idf_key);
322322
}
323323
tf_words = tf_words->next;
324324
}
325-
326-
325+
326+
327327
xmlFreeDoc(doc);
328328
xmlCleanupParser ();
329329
g_free(local_dict_name);

ext/summarize/summarize.c

+12
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
#include <glib-object.h>
77
#include <ruby.h>
88

9+
#ifdef HAVE_RUBY_ENCODING_H
10+
#include <ruby/encoding.h>
11+
#endif
12+
913
#include "libots.h"
1014
#include "summarize.h"
1115

@@ -16,6 +20,9 @@ void Init_summarize() {
1620
}
1721

1822
static VALUE summarize(const VALUE self, volatile VALUE rb_str, volatile VALUE rb_dict_file, const VALUE rb_ratio, const VALUE rb_topics) {
23+
#ifdef HAVE_RUBY_ENCODING_H
24+
int enc = rb_enc_find_index("UTF-8");
25+
#endif
1926
long int length = RSTRING_LEN(rb_str);
2027
char *text = StringValuePtr(rb_str);
2128
char *dictionary_file = StringValuePtr(rb_dict_file);
@@ -41,6 +48,11 @@ static VALUE summarize(const VALUE self, volatile VALUE rb_str, volatile VALUE r
4148
summary = rb_str_new2(ots_get_doc_text(doc, &result_len));
4249
topics = rb_str_new2((const char *)doc->title);
4350

51+
#ifdef HAVE_RUBY_ENCODING_H
52+
rb_enc_associate_index(summary, enc);
53+
rb_enc_associate_index(summary, enc);
54+
#endif
55+
4456
ots_free_article(doc);
4557

4658
if (rb_topics == Qtrue) {

0 commit comments

Comments
 (0)