Skip to content

Commit 4b7c485

Browse files
authored
feat: add stop words for Hungarian language (#2069)
1 parent 3942fc6 commit 4b7c485

File tree

3 files changed

+205
-1
lines changed

3 files changed

+205
-1
lines changed

src/tokenizer/stop_word_filter/gen_stopwords.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"finnish",
77
"french",
88
"german",
9+
"hungarian",
910
"italian",
1011
"norwegian",
1112
"portuguese",

src/tokenizer/stop_word_filter/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ impl StopWordFilter {
5050
Language::Finnish => stopwords::FINNISH,
5151
Language::French => stopwords::FRENCH,
5252
Language::German => stopwords::GERMAN,
53+
Language::Hungarian => stopwords::HUNGARIAN,
5354
Language::Italian => stopwords::ITALIAN,
5455
Language::Norwegian => stopwords::NORWEGIAN,
5556
Language::Portuguese => stopwords::PORTUGUESE,

src/tokenizer/stop_word_filter/stopwords.rs

+203-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
These stop word lists are from the Snowball project (https://snowballstem.org/)
3-
which carries the following license:
3+
which carries the following copyright and license:
44
55
Copyright (c) 2001, Dr Martin Porter
66
Copyright (c) 2004,2005, Richard Boulton
@@ -862,6 +862,208 @@ pub const GERMAN: &[&str] = &[
862862
"zwischen",
863863
];
864864

865+
pub const HUNGARIAN: &[&str] = &[
866+
"a",
867+
"ahogy",
868+
"ahol",
869+
"aki",
870+
"akik",
871+
"akkor",
872+
"alatt",
873+
"által",
874+
"általában",
875+
"amely",
876+
"amelyek",
877+
"amelyekben",
878+
"amelyeket",
879+
"amelyet",
880+
"amelynek",
881+
"ami",
882+
"amit",
883+
"amolyan",
884+
"amíg",
885+
"amikor",
886+
"át",
887+
"abban",
888+
"ahhoz",
889+
"annak",
890+
"arra",
891+
"arról",
892+
"az",
893+
"azok",
894+
"azon",
895+
"azt",
896+
"azzal",
897+
"azért",
898+
"aztán",
899+
"azután",
900+
"azonban",
901+
"bár",
902+
"be",
903+
"belül",
904+
"benne",
905+
"cikk",
906+
"cikkek",
907+
"cikkeket",
908+
"csak",
909+
"de",
910+
"e",
911+
"eddig",
912+
"egész",
913+
"egy",
914+
"egyes",
915+
"egyetlen",
916+
"egyéb",
917+
"egyik",
918+
"egyre",
919+
"ekkor",
920+
"el",
921+
"elég",
922+
"ellen",
923+
"elő",
924+
"először",
925+
"előtt",
926+
"első",
927+
"én",
928+
"éppen",
929+
"ebben",
930+
"ehhez",
931+
"emilyen",
932+
"ennek",
933+
"erre",
934+
"ez",
935+
"ezt",
936+
"ezek",
937+
"ezen",
938+
"ezzel",
939+
"ezért",
940+
"és",
941+
"fel",
942+
"felé",
943+
"hanem",
944+
"hiszen",
945+
"hogy",
946+
"hogyan",
947+
"igen",
948+
"így",
949+
"illetve",
950+
"ill.",
951+
"ill",
952+
"ilyen",
953+
"ilyenkor",
954+
"ison",
955+
"ismét",
956+
"itt",
957+
"jó",
958+
"jól",
959+
"jobban",
960+
"kell",
961+
"kellett",
962+
"keresztül",
963+
"keressünk",
964+
"ki",
965+
"kívül",
966+
"között",
967+
"közül",
968+
"legalább",
969+
"lehet",
970+
"lehetett",
971+
"legyen",
972+
"lenne",
973+
"lenni",
974+
"lesz",
975+
"lett",
976+
"maga",
977+
"magát",
978+
"majd",
979+
"majd",
980+
"már",
981+
"más",
982+
"másik",
983+
"meg",
984+
"még",
985+
"mellett",
986+
"mert",
987+
"mely",
988+
"melyek",
989+
"mi",
990+
"mit",
991+
"míg",
992+
"miért",
993+
"milyen",
994+
"mikor",
995+
"minden",
996+
"mindent",
997+
"mindenki",
998+
"mindig",
999+
"mint",
1000+
"mintha",
1001+
"mivel",
1002+
"most",
1003+
"nagy",
1004+
"nagyobb",
1005+
"nagyon",
1006+
"ne",
1007+
"néha",
1008+
"nekem",
1009+
"neki",
1010+
"nem",
1011+
"néhány",
1012+
"nélkül",
1013+
"nincs",
1014+
"olyan",
1015+
"ott",
1016+
"össze",
1017+
"ő",
1018+
"ők",
1019+
"őket",
1020+
"pedig",
1021+
"persze",
1022+
"rá",
1023+
"s",
1024+
"saját",
1025+
"sem",
1026+
"semmi",
1027+
"sok",
1028+
"sokat",
1029+
"sokkal",
1030+
"számára",
1031+
"szemben",
1032+
"szerint",
1033+
"szinte",
1034+
"talán",
1035+
"tehát",
1036+
"teljes",
1037+
"tovább",
1038+
"továbbá",
1039+
"több",
1040+
"úgy",
1041+
"ugyanis",
1042+
"új",
1043+
"újabb",
1044+
"újra",
1045+
"után",
1046+
"utána",
1047+
"utolsó",
1048+
"vagy",
1049+
"vagyis",
1050+
"valaki",
1051+
"valami",
1052+
"valamint",
1053+
"való",
1054+
"vagyok",
1055+
"van",
1056+
"vannak",
1057+
"volt",
1058+
"voltam",
1059+
"voltak",
1060+
"voltunk",
1061+
"vissza",
1062+
"vele",
1063+
"viszont",
1064+
"volna",
1065+
];
1066+
8651067
pub const ITALIAN: &[&str] = &[
8661068
"ad",
8671069
"al",

0 commit comments

Comments
 (0)