From c800fa7b574dddd974366ced08ce5e42f7238cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= Date: Mon, 31 Dec 2018 11:13:16 +0100 Subject: [PATCH] lemmatizer fixed --- .../LemmaGenSockets/LemmatizerListener.cs | 20 +++++++++- .../bin/Debug/LemmaGenSockets.exe | Bin 7168 -> 7168 bytes .../bin/Debug/LemmaGenSockets.pdb | Bin 15872 -> 15872 bytes ...ckets.csprojResolveAssemblyReference.cache | Bin 13052 -> 13122 bytes .../obj/Debug/LemmaGenSockets.exe | Bin 7168 -> 7168 bytes .../obj/Debug/LemmaGenSockets.pdb | Bin 15872 -> 15872 bytes tests/lemmatizer-test/.gitignore | 2 + tests/lemmatizer-test/test.sh | 10 +++++ tests/lemmatizer-test/test_corpus.py | 36 ++++++++++++++++++ tests/lemmatizer-test/tokenize.sh | 7 ++++ 10 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 tests/lemmatizer-test/.gitignore create mode 100755 tests/lemmatizer-test/test.sh create mode 100755 tests/lemmatizer-test/test_corpus.py create mode 100755 tests/lemmatizer-test/tokenize.sh diff --git a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs index aaebab9..28afaf9 100644 --- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs +++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs @@ -54,6 +54,12 @@ namespace LemmaGenSockets private string lemmatizeWord(string languageCode, string word) { // exceptions + if (word.StartsWith("ne_")) + { + return word; + } + + Dictionary> exceptions = new Dictionary>(); HashSet plExceptions = new HashSet(); @@ -76,6 +82,7 @@ namespace LemmaGenSockets } + string result = ""; string[] parts = word.Split(wordInnerSeparator); if (parts.Length == 2) { @@ -85,11 +92,20 @@ namespace LemmaGenSockets firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart); } string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]); - return firstPart + "-" + secondPart; + result = firstPart + "-" + secondPart; } else { - return lemmatizersDict[languageCode].Lemmatize(word); + result = lemmatizersDict[languageCode].Lemmatize(word); + } + + if (result == "") + { + return word; + } + else + { + return result; } } diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe index a3ee5a9c6f09246b1daf7e7bea9141b1c03e214d..dfb65386f4a15752946443f376222a3041a5cdd9 100644 GIT binary patch delta 1940 zcmZ9NduS9#9LK*iJG*=D+}>U;mw3^;CYpf2cP-P?=`=f z`R(3t`*8b*d-)%?N^ehOcS^^4*ddp(CjFIGAmOHX)?LGz*oa$Xh2(`9P|q?VhmOWZ zpd$ciG&2ChEk;_)D|na{dYJTp1(BZw3^+A_&o(S0>xHn*_Bmu8jFZkcSWEPq`6lUA zNAeUipQ(_iKa!0+1r3t11Egc0ivBxrqF+TxIsLIkQ=@zm=SwkLM z#w3HgRip~@DNFh(G6p=!`%~#G8%PeuYZ8NWBQmiiyv}42Ntonwrbwy%wA`^J96D;+ zWz?Tph$IB?9-R~OS-Rn*rNwKen`ty+me--wzBFBjyrGq}rjsSP`eA{q!_km2P-FWu zr_ES}-Dyz)vA7a!!;CxANMn!X<(6TYX^px?WmMT6x@CD&y5(`G?R--#J?>DiQTe(v zQ;xJupuJ;sbkq*Q3YyL=Ql@9ShxLu~NL;3zquMW!H?)L)gXlz)W z!wAV?DKA~!wTMPRS?9Tnjk#>LTQnvYPH6@vbIIFyORK^qStFv%Pnv^@I~4<5YA!4J zZ^euv*nZMqgJiGLEMXsW13Sb$q95(FjWt@E76KzV0$c185u8{2MAR3AfqLy8kD*Vr zX>~YGG|)*J1K)DVg~SN%i+Tz>1IZJL7bt~+-TXRfnzVY758)p5GeGS_52K&l`Vy^G zbUvE>!sSe(Ei&vDGg*OQgEp7NFcr_T6-gOb#J^Mm!x=T(6W^+4H^*&5Sil(8sWpn| zqnU>9s3(Cvc!$c>ai6*pD8bN{F&%cJvZ*Wv5^7e8@6w)U^yvU|8$7jzyP6kUHAdeJ zxmGRFhh}0JGZg12b}Du$u2fv3*hj2JzmhjAZd3f8Scd>|NE2n|qGABI#XMZWQL%v7 zA-W1QJm4#cm-tRZl~ZDzIm{!COcBL0z7GkEXfpUbc?R%1KS&!L;GdGl&_BZwT*i+$ zMzQB`l6aB04p)hzoDr{xUy0rPCcG?;JH)T~2nw=_e<;Ov&e;{5<)&hgdD#su!cL+G zHz}+|i<5kjuC9oaRBkOhib-r5_1G$+jkUxH*iLM~8^jFu6K5&TRa~g_&*3oXb4s&T zauL?!OH#dt>%>9)$y(8kFVM)6wA8y0!?K$ypH@LHQiYJZbnzxjpVY z)}|HlBrbJ3eKtE&UhTWa_1sWjeyH3MoXP{vK=0zsTl!w_AKC`@g{qWcZo&rix?8HA z3CGrd5E@wC)p5M3@!VGvUOMM~Q`OzmWL8y27V}VbWI1~hq52RHan3^okz_pN``=Yp z0iGo1Ho`VDlFVi{$SV}m!Fl=R@JB3r<5si$u*g3-BAN)?0aoP}<#zq>bLzLY zaL17cS2vbUe7?7wjCfe&?D(IjjJhV}WGNM67u=P0S2+}Yp!2_Gl&!kEtki1$2enT! Aga7~l delta 1814 zcmZ9MU2IfE6vzKFclPe??RL9&Z@b-=B3s(hExSOeLQ230LJN(N6c!|4qai#PFy)rE ziBgE$8bO0WU9SX16C@!q#E*!5X+$4TK}-k_5(A>4F=A4q1iv2q3izKXR&aLnoBy11 zX6BsPnH_E)Zhzq!+0j|rcZV1%9orzXfe_81pXLE|LA}fe8%47i4n{;V(=#8Yts*5t zxY7W`Vt~bw4cLM3T1UHOLKLfoj)04SYQS*@(qvhABFP5?IoSBlDn9T@+-Ad zx$3#C>FdIaxRLxra8>G%7uB3hXgXt3^p?40`D(~r%tl%M9kjYppXobZ^mePNJ~U}( zKtIo?&vmj@I(sjiyL|y=;({?%UtGYrY=%LxNapw1z(759dAho8V5Xj_m{!#10XY;i z9KTVk_Ay*OuuY*c$?xv_BS7Qde}6H;j-JUg8B^R#%A>q^&l!&!^Jwpr1v?(+w7c%UIS z;?Jd=<%ZHe?+?=t`K`L=HnH}lH*VbU%HfuWeIAsPZ3`}kw%r|L2@OvGkF)<<>%w6F z*6oiB=xPU}z&iE~>kBwTozQya`mPo1FLHcWu*Y63+JcjIQ}Hd!Mw8V15kpo5&gvXd zDL~7shPw@8Qd=$>{j*`t5Mn>=m!bKvv2==q(ng<}Aj{CsGFDqTD-J=OL_|HUlK9y0 zMU^Zn8%b+IrtpBuS+zJ!+E_@7jSr>fax#glD#>Xlp!tsBDK25-N%;jWbykx4HB9ir z0=yM$f*)P_Vyj6ln;Kn^dStOgfyY#fC<<({+C&Pou~2kp^u!VT!W0Bfn9<&JuNh6; z^bKN>5a=;`j9@cI7KVBKfW3H@>4q@I8wix}m%K-WaNdk2SF!Q+q%S>SbqGFvh+s>k zrg+7QqSq39_j<1y(m@NEzd~fN%b28up_TW7En-t_Zb%|Uj#}O52d`G@7e=uA9N)E|C$WP>TLnWf(D=R9FV*|!HYo=8} z{Wb*5NzL5VrPMe_HgC{TJ)NCV` z+!}Jz_3r2q=dv8#SKcUt6BU`@+lmE=@wLnL%zpj!>i4cL?Jb<0*&JjO>#W9bEGQ;w z+NQ^)jLUdjI#tOI8Lvv-FXAD?NJdHz2~Vhuka0UBJmGcuHA3M`uIhq2bbKU8+{@7xhtblXNT{s zQk}ZSALM(j|FG)S7yRppE6UfaUgxp$q^savclJUFeW^0+X(3DKir|^zC%Te*yZ4Cr zCFJ$!4}-xQ;*34OPOHHZHF#AGJ*!e< zDs|pkR;H|>Nh?)irT$XGQMJ!&4R>1mLt;suM5NYhwox=kkA}*Y#;oiL()@VK&g7o1 zo}E1po8dBXdDHA-Jdb^`g_!<2bgFooN-vc(GoI5URjJ}Jk4QI_HB^r3s_K=QIb&4j zQE7ELtG8PNjVv-vs`yKo-@~z-Pd#;WG>F)yd>hWiE5BZ|ug564xyn{YAS zi7S!CZN{V-*Wp^+gfVg!=4UgpNFX4LfmvI9g!+UTP zH{zRkFFKgO571oiEV8)$5pKrwtS|Qwe3_f@4c<@wEpEZzkY?MzBfGVWxD_23z^xd> zZ5YDsSdTmKR(uH8pjkME^v;gsBiM?MVp}yqH$gl0V1X1$u`FRZ!MWm<3jK3UOoeoK zQFx#sQ_I%lxw!Jbb93he#a!nqBNw@D_Bm3SXT?me;qwV~bL?hK6DF|;pTY(BG%mz` zk-?jhj@Zk&S&|~)nUG;@Li6kH!h_`WR1P66k;6z!GUM&|0y(V_T4-;@5j1Uo5zTcq zGAA>iSZJ|r+Dwb>SMXIck1u(i;B|Zr&)`vf52@K7;~V%r@(kFQ@g$n|ox;Cy9IxR4 z%;TrOh50yv1vrUicp7iSDO1mMWtsE-;(h9!+D#sn)|+| zE8_0b(m5s<$7+*A`53Q0kv@g~tjn+0znm*O^KkIH%o3sPDznL>L z=iWJ&i|*20SMvO&4UIJ&N<{9opWb%2Yg*HV#piEDm-FgWr!GS>g( z%T3AsMIV2?YR1+2OUjqw4UjH z9?(AH6bTpmOdwpT{AOmjO0}Eqp;^nK%&1nShgJ6>)l;R?^PHX~PS5Zp<)nA0^cB_H z=cIoSal|WesV%nmd*roOk&sQBZTTYJ0y7pa4-9bMURK!I0;3`q0^bs8A#$Cy%gk4i zbU?GbiO6Ci>rBJcS(!DO$V4Im_u$lxPU<`feq*Tk31)85qXc!69R4c7p}wS*nuS+ zeLecbZ~CSu&AsW-)NBdJcq!&2MCEr@j#{YuYYRu~B!lg4#Bs>^br3gW z6;g-pnYe@TtGE-XyL^iEXpcbMWDnzHiVpS36qWjD&YM}l`KYF7&L^7R4cLvJU=P}T z)KT}NO=ysM>H$26r?3xC;{iO2hww5UMqA4xxaKw;HtUCY5}zWQ^&iOQObrLo*6=g* za&%slE<}=QAD+WW_&Jv0kh^_GzdF9+*0tKUd^KR!)|IOPcURpx=lF8}iP}PU+UpDR zik|YPYulE5TkEzK^B`Vfj>VTdnQlDQ*=%P&u$N358_D&QbT->~G#8MU=QyBz^dEEf z>&Sf>@~?gNt>@l2pT<9viZ|jBnUGDYq-t(1Kj-zvn6sagZFbIlm#9z)^GhQ5KmYFL OlsZoDI5+<0i05x;DFYq= diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache index 08e454dd60c5e8bc92e0955212eca78ceee26490..58d7cf59276ad0e10d9e5aee84f9d2140ef4dd42 100644 GIT binary patch delta 304 zcmey9dMIr}CnKZ4<}OAiHkN8_PTS2B*zFiKK`f7p9$Z{pY^n?l3|wU3TbH5X&t=58@b zM#kLDOT`r#8EZBlmr!Ek;g16u4g^e$36meFN^EA9Z)9U!Ik{Cel8ZkVBE`V8`JSo> z6XV9spVg<6GJ-E2I*i;!97`Vzz^bGWj4U7#f zxfsQO0>VJd08%vhJBK1z-((q12xGG&ryV22sAeuGYw}TUsH#^`#%6w=YA(jW&D~;> zj6iCsxFU!;E}_K6#F#L-&q#bTk9-pwW8ma|)kr4Be6R?&+Ha5|c8%#QK#BP}K#4lA zgtV?5P-JtIAuFRK<0_!bf`OO?hT7-=yvUIOW5 GU;mw3^;CYpf2cP-P?=`=f z`R(3t`*8b*d-)%?N^ehOcS^^4*ddp(CjFIGAmOHX)?LGz*oa$Xh2(`9P|q?VhmOWZ zpd$ciG&2ChEk;_)D|na{dYJTp1(BZw3^+A_&o(S0>xHn*_Bmu8jFZkcSWEPq`6lUA zNAeUipQ(_iKa!0+1r3t11Egc0ivBxrqF+TxIsLIkQ=@zm=SwkLM z#w3HgRip~@DNFh(G6p=!`%~#G8%PeuYZ8NWBQmiiyv}42Ntonwrbwy%wA`^J96D;+ zWz?Tph$IB?9-R~OS-Rn*rNwKen`ty+me--wzBFBjyrGq}rjsSP`eA{q!_km2P-FWu zr_ES}-Dyz)vA7a!!;CxANMn!X<(6TYX^px?WmMT6x@CD&y5(`G?R--#J?>DiQTe(v zQ;xJupuJ;sbkq*Q3YyL=Ql@9ShxLu~NL;3zquMW!H?)L)gXlz)W z!wAV?DKA~!wTMPRS?9Tnjk#>LTQnvYPH6@vbIIFyORK^qStFv%Pnv^@I~4<5YA!4J zZ^euv*nZMqgJiGLEMXsW13Sb$q95(FjWt@E76KzV0$c185u8{2MAR3AfqLy8kD*Vr zX>~YGG|)*J1K)DVg~SN%i+Tz>1IZJL7bt~+-TXRfnzVY758)p5GeGS_52K&l`Vy^G zbUvE>!sSe(Ei&vDGg*OQgEp7NFcr_T6-gOb#J^Mm!x=T(6W^+4H^*&5Sil(8sWpn| zqnU>9s3(Cvc!$c>ai6*pD8bN{F&%cJvZ*Wv5^7e8@6w)U^yvU|8$7jzyP6kUHAdeJ zxmGRFhh}0JGZg12b}Du$u2fv3*hj2JzmhjAZd3f8Scd>|NE2n|qGABI#XMZWQL%v7 zA-W1QJm4#cm-tRZl~ZDzIm{!COcBL0z7GkEXfpUbc?R%1KS&!L;GdGl&_BZwT*i+$ zMzQB`l6aB04p)hzoDr{xUy0rPCcG?;JH)T~2nw=_e<;Ov&e;{5<)&hgdD#su!cL+G zHz}+|i<5kjuC9oaRBkOhib-r5_1G$+jkUxH*iLM~8^jFu6K5&TRa~g_&*3oXb4s&T zauL?!OH#dt>%>9)$y(8kFVM)6wA8y0!?K$ypH@LHQiYJZbnzxjpVY z)}|HlBrbJ3eKtE&UhTWa_1sWjeyH3MoXP{vK=0zsTl!w_AKC`@g{qWcZo&rix?8HA z3CGrd5E@wC)p5M3@!VGvUOMM~Q`OzmWL8y27V}VbWI1~hq52RHan3^okz_pN``=Yp z0iGo1Ho`VDlFVi{$SV}m!Fl=R@JB3r<5si$u*g3-BAN)?0aoP}<#zq>bLzLY zaL17cS2vbUe7?7wjCfe&?D(IjjJhV}WGNM67u=P0S2+}Yp!2_Gl&!kEtki1$2enT! Aga7~l delta 1814 zcmZ9MU2IfE6vzKFclPe??RL9&Z@b-=B3s(hExSOeLQ230LJN(N6c!|4qai#PFy)rE ziBgE$8bO0WU9SX16C@!q#E*!5X+$4TK}-k_5(A>4F=A4q1iv2q3izKXR&aLnoBy11 zX6BsPnH_E)Zhzq!+0j|rcZV1%9orzXfe_81pXLE|LA}fe8%47i4n{;V(=#8Yts*5t zxY7W`Vt~bw4cLM3T1UHOLKLfoj)04SYQS*@(qvhABFP5?IoSBlDn9T@+-Ad zx$3#C>FdIaxRLxra8>G%7uB3hXgXt3^p?40`D(~r%tl%M9kjYppXobZ^mePNJ~U}( zKtIo?&vmj@I(sjiyL|y=;({?%UtGYrY=%LxNapw1z(759dAho8V5Xj_m{!#10XY;i z9KTVk_Ay*OuuY*c$?xv_BS7Qde}6H;j-JUg8B^R#%A>q^&l!&!^Jwpr1v?(+w7c%UIS z;?Jd=<%ZHe?+?=t`K`L=HnH}lH*VbU%HfuWeIAsPZ3`}kw%r|L2@OvGkF)<<>%w6F z*6oiB=xPU}z&iE~>kBwTozQya`mPo1FLHcWu*Y63+JcjIQ}Hd!Mw8V15kpo5&gvXd zDL~7shPw@8Qd=$>{j*`t5Mn>=m!bKvv2==q(ng<}Aj{CsGFDqTD-J=OL_|HUlK9y0 zMU^Zn8%b+IrtpBuS+zJ!+E_@7jSr>fax#glD#>Xlp!tsBDK25-N%;jWbykx4HB9ir z0=yM$f*)P_Vyj6ln;Kn^dStOgfyY#fC<<({+C&Pou~2kp^u!VT!W0Bfn9<&JuNh6; z^bKN>5a=;`j9@cI7KVBKfW3H@>4q@I8wix}m%K-WaNdk2SF!Q+q%S>SbqGFvh+s>k zrg+7QqSq39_j<1y(m@NEzd~fN%b28up_TW7En-t_Zb%|Uj#}O52d`G@7e=uA9N)E|C$WP>TLnWf(D=R9FV*|!HYo=8} z{Wb*5NzL5VrPMe_HgC{TJ)NCV` z+!}Jz_3r2q=dv8#SKcUt6BU`@+lmE=@wLnL%zpj!>i4cL?Jb<0*&JjO>#W9bEGQ;w z+NQ^)jLUdjI#tOI8Lvv-FXAD?NJdHz2~Vhuka0UBJmGcuHA3M`uIhq2bbKU8+{@7xhtblXNT{s zQk}ZSALM(j|FG)S7yRppE6UfaUgxp$q^savclJUFeW^0+X(3DKir|^zC%Te*yZ4Cr zCFJ$!4}-xQ;*34OPOHHZHF#AGJ*!e< zDs|pkR;H|>Nh?)irT$XGQMJ!&4R>1mLt;suM5NYhwox=kkA}*Y#;oiL()@VK&g7o1 zo}E1po8dBXdDHA-Jdb^`g_!<2bgFooN-vc(GoI5URjJ}Jk4QI_HB^r3s_K=QIb&4j zQE7ELtG8PNjVv-vs`yKo-@~z-Pd#;WG>F)yd>hWiE5BZ|ug564xyn{YAS zi7S!CZN{V-*Wp^+gfVg!=4UgpNFX4LfmvI9g!+UTP zH{zRkFFKgO571oiEV8)$5pKrwtS|Qwe3_f@4c<@wEpEZzkY?MzBfGVWxD_23z^xd> zZ5YDsSdTmKR(uH8pjkME^v;gsBiM?MVp}yqH$gl0V1X1$u`FRZ!MWm<3jK3UOoeoK zQFx#sQ_I%lxw!Jbb93he#a!nqBNw@D_Bm3SXT?me;qwV~bL?hK6DF|;pTY(BG%mz` zk-?jhj@Zk&S&|~)nUG;@Li6kH!h_`WR1P66k;6z!GUM&|0y(V_T4-;@5j1Uo5zTcq zGAA>iSZJ|r+Dwb>SMXIck1u(i;B|Zr&)`vf52@K7;~V%r@(kFQ@g$n|ox;Cy9IxR4 z%;TrOh50yv1vrUicp7iSDO1mMWtsE-;(h9!+D#sn)|+| zE8_0b(m5s<$7+*A`53Q0kv@g~tjn+0znm*O^KkIH%o3sPDznL>L z=iWJ&i|*20SMvO&4UIJ&N<{9opWb%2Yg*HV#piEDm-FgWr!GS>g( z%T3AsMIV2?YR1+2OUjqw4UjH z9?(AH6bTpmOdwpT{AOmjO0}Eqp;^nK%&1nShgJ6>)l;R?^PHX~PS5Zp<)nA0^cB_H z=cIoSal|WesV%nmd*roOk&sQBZTTYJ0y7pa4-9bMURK!I0;3`q0^bs8A#$Cy%gk4i zbU?GbiO6Ci>rBJcS(!DO$V4Im_u$lxPU<`feq*Tk31)85qXc!69R4c7p}wS*nuS+ zeLecbZ~CSu&AsW-)NBdJcq!&2MCEr@j#{YuYYRu~B!lg4#Bs>^br3gW z6;g-pnYe@TtGE-XyL^iEXpcbMWDnzHiVpS36qWjD&YM}l`KYF7&L^7R4cLvJU=P}T z)KT}NO=ysM>H$26r?3xC;{iO2hww5UMqA4xxaKw;HtUCY5}zWQ^&iOQObrLo*6=g* za&%slE<}=QAD+WW_&Jv0kh^_GzdF9+*0tKUd^KR!)|IOPcURpx=lF8}iP}PU+UpDR zik|YPYulE5TkEzK^B`Vfj>VTdnQlDQ*=%P&u$N358_D&QbT->~G#8MU=QyBz^dEEf z>&Sf>@~?gNt>@l2pT<9viZ|jBnUGDYq-t(1Kj-zvn6sagZFbIlm#9z)^GhQ5KmYFL OlsZoDI5+<0i05x;DFYq= diff --git a/tests/lemmatizer-test/.gitignore b/tests/lemmatizer-test/.gitignore new file mode 100644 index 0000000..b87b494 --- /dev/null +++ b/tests/lemmatizer-test/.gitignore @@ -0,0 +1,2 @@ +differences.log +corpora/ diff --git a/tests/lemmatizer-test/test.sh b/tests/lemmatizer-test/test.sh new file mode 100755 index 0000000..1e4aa0b --- /dev/null +++ b/tests/lemmatizer-test/test.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +./test_corpus.py corpora/A_en.txt en >> differences.log +./test_corpus.py corpora/B_en.txt en >> differences.log +./test_corpus.py corpora/C_en.txt en >> differences.log +./test_corpus.py corpora/D_en.txt en >> differences.log +./test_corpus.py corpora/A_fr.txt fr >> differences.log +./test_corpus.py corpora/B_fr.txt fr >> differences.log +./test_corpus.py corpora/C_fr.txt fr >> differences.log +./test_corpus.py corpora/D_fr.txt fr >> differences.log diff --git a/tests/lemmatizer-test/test_corpus.py b/tests/lemmatizer-test/test_corpus.py new file mode 100755 index 0000000..8c986bb --- /dev/null +++ b/tests/lemmatizer-test/test_corpus.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 + +import unittest +import json +import requests +import sys + + + +def lemmatizeSentence(lang, sentence): + data = { + 'operation': 'lemmatize', + 'languageCode':lang, + 'sentence':sentence + } + + address = 'http://localhost:8800' + response = requests.post(address, data=json.dumps(data)) + return response.json()['lemmatizedSentence'] + +corpus_file_path = sys.argv[1] +lang = sys.argv[2] + + +line_count = 0 +with open(corpus_file_path) as corpus_file: + for line in corpus_file: + line_count += 1 + orig = line.rstrip() + lemmatized = lemmatizeSentence(lang,orig) + if len(orig.split()) != len(lemmatized.split()): + print("Different length in:") + print(orig) + print(lemmatized) + if line_count % 1000 == 0: + sys.stderr.write("Done %d lines\n" % line_count) diff --git a/tests/lemmatizer-test/tokenize.sh b/tests/lemmatizer-test/tokenize.sh new file mode 100755 index 0000000..442a2f8 --- /dev/null +++ b/tests/lemmatizer-test/tokenize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt` +do + a=`basename $corpus_file` + concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a +done