From e558cb05d846e348b146cd25b18dfcd7ab5ea72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= Date: Sun, 5 Mar 2017 22:45:11 +0100 Subject: [PATCH] lemmatizer facade --- .../LemmaGenSockets/LemmatizerListener.cs | 24 ++++++++--- .../bin/Debug/LemmaGenSockets.exe | Bin 6656 -> 7168 bytes .../bin/Debug/LemmaGenSockets.pdb | Bin 15872 -> 15872 bytes ...ckets.csprojResolveAssemblyReference.cache | Bin 13598 -> 22377 bytes .../obj/Debug/LemmaGenSockets.exe | Bin 6656 -> 7168 bytes .../obj/Debug/LemmaGenSockets.pdb | Bin 15872 -> 15872 bytes concordia-server/concordia_server.cpp | 6 +-- concordia-server/concordia_server.hpp | 4 ++ concordia-server/config.hpp.in | 1 + concordia-server/lemmatizer_facade.cpp | 30 +++++++++++++ concordia-server/lemmatizer_facade.hpp | 25 +++++++++++ concordia-server/socket_lemmatizer.cpp | 40 +++++++++++------- concordia-server/socket_lemmatizer.hpp | 18 +++++--- 13 files changed, 115 insertions(+), 33 deletions(-) create mode 100644 concordia-server/lemmatizer_facade.cpp create mode 100644 concordia-server/lemmatizer_facade.hpp diff --git a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs index 9eee9bf..9e525d8 100644 --- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs +++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs @@ -20,6 +20,7 @@ namespace LemmaGenSockets { lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish)); lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English)); + lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian)); } public LemmatizerListener() @@ -29,15 +30,24 @@ namespace LemmaGenSockets private string lemmatizeSentence(string languageCode, string sentence) { - string[] tokens = sentence.Split(null); - - string result = ""; - foreach (string token in tokens) + if (lemmatizersDict.ContainsKey(languageCode)) { - result += lemmatizeWord(languageCode, token) + " "; - } + string[] tokens = sentence.Split(null); - return result.Trim(); + string result = ""; + foreach (string token in tokens) + { + result += lemmatizeWord(languageCode, token) + " "; + } + + return result.Trim(); + } + else + { + //if we can not lemmatize, let's not do it at all + //primum non nocere + return sentence; + } } private string lemmatizeWord(string languageCode, string word) diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe index 8cb06db4b0eb48c3f7c0ddf498dd6a48fc8bd83a..aee4031f5220f4133e1fc230bfc521a379d06575 100644 GIT binary patch delta 1359 zcmY*ZU2GIp6#nk~?d)!M?d)uK7wFHnTj@>Tr zNQiEE5dTtjbBVD{G*o%1F-mMeG?YaAeLy2hNQlPp;tMh1p(gr3z;C97#7XA6=X~F} z=bn4#%$?aX)A{zRO>ew>W&mr?6*M6zvOt7{xk}b&D<3^3&PyR$^rLM+t!z|U<)CN@ zG*>es$u}Zj7a0+ir$mcbFaHotNic`|^zv1pV@C?u7X(y#24Khw)!8aLt9k)$pxmRh zhm=DC(25~ovLPg=m98@EtcEe_6fCUjs$&CzfYI6qE$T#R2a9ATtKbX8Y%P=WTiU#> z!&FpDPitJQSVK8Y2vot+;sds3Y5q6!9hXceM$$Gi1~eipV>8!s>w1Nw0)|6V4k@h4 ziOPC4KD3URoH!@NZU{F`Yt8Md5@bz>0bJ!)r7ma0=RG!L7V9|M?-gR}nfU*rG-sWx zsLka@x777+5;OTPt0`@u9ZJM&OSLp?jT`Vg%oCu|5OC^UtMfFgrSCMjA$l~J>E?Ig z&SyDU1T5WflCI+SrI|ip#4NwlL?g|w5(xT=_qdyCrd@MGZqcHHw|!@$7Z6K$hHdx~ z&L%SUY97ThEMHo657?dIWpi3w-L!Pebka08S$e{`lLjo^*(?{lW9G`O@=I;}-t(NC zdI{)ctv~Zk<)(`H!ZRgzJEkAl!7xZN*1JTjd{6&YoRcF) ze|d#n(y3lYjGKqpO#lw#d(|L+-1Ckk`aJLAnl=tq2=ww$+{nl2srcRVjv%iXY}_c1 z2&mw);ukvFJ#RIOfzDMeu}8f_u&L!%-)rS6oYJ=A0t(trVnEwP{88CW{8)LC{j{u% zFx#WbIOcGUUABNebry@nqr|;3>Zf%1cg zKT-Wc{S}L4JiG+<-1(3O&RO28nlj*yUlj z$M@kW@;Q%r!ov|9AoCm+h$Vc&%VHlK{K+HNW_Sjt&3g4JHdY#K`Ii|HSLIE!W2xQ$ zxt8g63RA=RLaA@;AnqR>Esm8+4-W6oPaFjFNlXyOP#`{wqP!Wpi#JeVE?W8L*vihi zZ!RwXR{eBNZVzW8L7z{l3y15%W;m=Tg<@mrK=`b9;MO(9f0erfPVlNa!JgTh>&Xp$ zwe7q6MovCFdnr-e`q|kuWsGkd8g)jd_m35(N<+`*OVh&>$&!4r`dZ}gcFJo{*uBS+ imBzTp%6F_6mXe8MVksP1(ft&vTrIm~C9a7|*7qNb%Oj>Fsqz$st(v+8mmQudKQU^t2 zeCZK-r1)2;w?$S|szuQ#e05heq-I-6lb`1xVE}2MEdglMOu$sz+?=?f24stxm$`8- zr=yFm0&t;M1s)KOkCp?41kAEzc{XL)GoFr^)LcEIGr%aJAOvIL3qjV|QQ9fL~<{>XV@YS-mdAv5v_|Y(km0DwV#SN6p z{hGQWEphJ7)$O2;B85%yU}Gq?f5Ggj2akm=MpYuaixp=9PIHckyM`HT3N3allj3M5 zT|20!kZB+!Vxv5ULcrR`tJl?-UUl$)J=h#MB>hP6%$Of+A>)NO7xc_(6sOYaKkUe%3vcn~G<$>Ti8`#}X%}-vD^9SmK|~kBttE94Us| zxzEmO=a}xxqscBcTY67lS0}k=sIRH>Mu*6%JI1$SQO%oueo|TuQn2W@Id6hUPzlrcBtyi?~=)V_pxH!(svqI%6@6ROO zq%6f?vkF7N@^ ziMaW*=Z<1E~Eu8Z(Rc<|)NP!>QBpOqdyIlcML{LUl49(~fP z+kN51AAkAs)U|CiV|e>OXE1o;_)8;W#ew}p#S@2*rSj@d#cJjIY|jVZ;T;LL*P%4UuA+aP$X5mXOmaR*O?>8pV<`BB4DXCK@ z?NH4Pw^}V@t5x3GYOBybG_^0bS%b|j*Jm~8W%(hf-*dST`JprZ_xHS=bDrlp=l{p_ zn4XW*vb-nyzI;k6rM6$J{dR5JXB$ra=us**H|Id>`0saKw>jR; zX@72PF^L`C3tD&E5`O`|js6q*xc%PWj2sF)h8zo=(@i!vxJEZ6HU;Ntl-5vl_*!bS z_NhISzjKswb|FdbcD!Pt{VC+CJVKxZO*`_`9QO?(4EAR1eW?k46?SVo5ejIB_2mbA zcZlgBCWL=*R!q9xm|u|YP)@bM9`IL{1o1iM>)qPConeMuW>}lyJmyM7wOZ`Pt!~<{ z@;&9Fsa4ee_E@a7YfpFklO3Hql#+}_*y%0uKT6D~oyafEo{u)j^Ej4qyR6_H-#RBE z!!PxvMmi-&?X7}9_874@P{kXyx#4r!Wob&ikL7b>-?Nv)hqJF^c^At=SU$De3agSa z*Rd?bveTX^Jnh?>T3OS^rj$&8YO)PQ#U<;Jyfn|HvQ)O(%d;=}i2f_SXfm?5Xo=5H zR54lrMj(AMOJK-eEGmfqg*0^{I(a&%$sEQ+V1$X5G7Di2y#33hkyaL=FMzCU*g=)R zFuWHMnO4-egeQHjja;dpf-5S9;YCkLv#H(3e>Rkn; z-Wr$-rEdDIR-^l1EtDHc63KeRPipD`^cAoU${DSNa(eYp%6JgQASZ8Lfb_r|fe%B< zvg;#_I%-cv0`W(%NG3GJ+yh&n2X2CLp4^XDD&f=U(%EMqSu2`m8lYsvsw#>unb17L z8mbLSgT*g*Y!|xR?cFd6_rSe!GW?i0hJElflnEB@M<0NcVcvip@J+ZA4#ETQBlsfx z0(QY~;34=k?1eYs%kU4#9%cd>{wCpb(5Wf$FhV9Rpg+b1`=GqhuRP(AXX)aC#m;U>G%xLNC*wxt@ zvu;l$adP>Hb~QO=x>5r+wqhtgbWmhD%hX7DW#gfPVy~F?@GY1Y#YY2X-Yvxc_~cu3 zbUOGhUz8eWzA;?1SrfPPc2n`Z)I{QDhE`j8kK@aj<+E2;26ex^wbJuH|5B>G4oBwP KM0L%T^nU=M4>i94 delta 1564 zcmaKsYfPI}7=X{|N} zmmkY=7EKH$CI*R7S{o{PfWaD|^h4D_&Op*17(7Hat(R21>U~ zwq4u!b~X0L`UmarN@vQ(9#+b%tlp4!9-io*&Su7^@~Ph8{B(LO`mmZyHmhDw|G&GA zsv7U_j<(m9)6UDP+;`I6b)dA*9Pq75*{QY#BK&p)UY0%TtH3ej+TcUT+2Eo~sG86L znJ`a;DkMs}_E_{vv6QYtMb_^YAsi_r-I=->uTtODI(H5csDm~g0THd#(F9r4^3=p= zW<0OT!ya{Cc&q<=d;@F?;mQ?4h2CaYnQ%lNQj>vXSX66=Umh*1h#i9|D0c7gZ>RTsI-eI3iwSbDImsGi81{-^_y70{IzR|(C;Rb@2j zUL*EnRPBwcebGg?-7ds=EX%~^)Q#vl_c9g(%l%jk^<3RvD`pu>6_%u0sk`VuQp_yr zJyDd|CJw4heS_aAT*56zct(Ymf5X$qlTh!~@ACUe+kh5;F&KoqU`XApkIb$j73M3g zS)+<_o+T}?B1G%fqR63yn0>NR+3VO>sOzcsSNgS(W0jMQv*!wHy0p!(U@tQ2tnOmOWOyUA^P9^tZ(Zg$pTOoZKUPxB!N{$LKN0p)*ohro%*alBR_E;6^+4Q38N=`#!#TiJ9 zI081rxtz7L1tRY)@9 z6F3QffO&WwPQgFnH2e!b1#dx$F&vUReg>h85uAli_$>6m8R&=4n=RWvmzTQ7K5UGY z)apN+6l0leHa#_bHuK0tZh*VrQAtLzq}rRja!s9S3d%Key6JVBYyL&;(8~-Z{_~?2 z?aoaM=F-{S{dqC6KlRTqpE;NOQ95IeLaGNw3qOzi1?<$kVMy} zvVd}-s2qvY(d{3CLKfZA7cX_vTSd{J_1k?t_=-nbvFgi~kW8r?E#Ck6+_(B{wt1J? Jde3Jie*?03GM74O5zS_z?LM- zIBQPU?3}&tdG7O_bN1Qi1VkVZcn^X985)Bim#$AG>|8!K=)_hPQt4!DvzgzOk9Fm2 zYktcX%g(2A+2!qvVr~4J-_lh`I|bWXp0x^&ZKmh9^cDuvsl-Lr-p#pPR(AP7+rq() z!Op?<_GCw!xzG%;vNNjJ(YH76SeZV@bS$RQ2UBTFax+dD|Abbg(^kSk6Y{YhGe7K@ z18Iw`)@OidBp;YGf&|98oSE!46Hd;K6#LWAmCKBnwq?&^p_+q^xD<{M%IxHO*pY+Ivu8BQeDef&X8?pTDqb7e9PQDOBSEozsVZ3Y%80v zT2|)sRtCds<@>!6?C(Z;Yj@7x)gRliW^;)fT@>qxEhr_7b;;Ugbq%P_oRaD?`9#i6 zrv_v-I;u%bNj1f>-I__uiVReA5ymL#jnU}%80C*{jP_;Bvt)tCpupZqD3E9r zAu9WSubf~o2<5Qhfy?LV%X?nvF$^O-UpAJu;=;Ct?MsaiK`~t$`|y3v9Yqo z3QjW4lb^q_^}6Lvz)dR{y^W+DB)kAb#f{*EWQOZEwu}R9AsBrTNe4)HF^HNW#K}8d z)zK*%TLPNkBE#N2IJSAbcT2$NOG%c2gg*h|>m3)Q>Ds&VWn)Xd3$;w+#x5^wtl$Fg z8;VatpkF|;0wlZ=L>&roCIrmjErF}R=v^eMLBeZ5d>OcLFoTEvZZP_VBt0PEwIFJS z5I33WI@q6*jV%qf;AY6)c@y;_F#0-@^&sI5Aimyl!!)fs|H7dneTJ#RCcH_ zmG4VsM$#57ma~V=JkIk8E1%EVEA7N^%7Kdj&YHE^?g9^HOr)nWnSykqG}>0i92v2a z-F7ZBgAN$&v|?Z30Vuj%e0Giy;*jvvQS!{d{b1Rtg!IthAh@G_+qR=Ig66lBr=DCF z)b{NhN7^d-CXjF+NU!vpRCAA%oL+K-kHqJdEu_;deKSaerdxp8q;pC;s*#pW{4}!Z zTSzVeX{Hg#Ah^Fv+i)u);m?4qUdOXislAJ0?F(WHmn^~m_62Q6;Hz`n{aF+iq3Iyo z%~pJfc5g#Ay&t5R_6LKYg&~`~9g*;*AS+91FSYxq^&Ce@5n3)2(en;u*G-ZEkY?II z41$J(Y;*#Va1vx~snPDicLI&KP-29J$wZVri0t|hNHeV)2EkocwtEzzWd^xh~ z`$?_5RB{H4!aE}TbHanOaD%d9PLEH@)z6>|I*Vot&m+P#` zoxh@M=O1Z%itYDlph^X*MI+S{3}1=SJF#-PE&3S^{VIItFMI%Gi1$UyT3cIpE^GDN zcCD3{C9UpdNvkZmbXnr@&*%h<42Xr=dritKyQcq01aG6PSeoEznSL_Nd~| zlW@cBAT6N=+KvqR)#LvIf)<)wI-hhGpxGz8cN-#78Mh-MXb4L27hUm}D5fDOd;MjKOB3DDzXE~m z`W=XDoy4yD9P;{|C|wMoPn><)0WfBf!3Xqz2RN_)aWJL+$C042|2rU+9sN4<^r_4?FA`?gX|G+1TFA7Xy|^zVQOny^xQ$Q3_K@i2(g$0LYT z)dzJU_3>RqRDDePMj-X^J=7w)Reh}S>0`C3k1n4+b|R~%4{r^jAuK!o7$Q<1k0T<| zuJ{R8{3D8K8cTir7?BU!AD_f`sg|b@QTNA*E+F14a$!>3lBIT@MrC5Ds+}c1?R2Wz zS?tpe&2mpW-l9S~TWaSSM5K0}Wfio%rT95l{5-|BEB>h~et}}z;!0wq zp}N9O6gS=q`x5IuPgTabK4r{Rl`+Ssi~_QH%J85<<6ZXpSBOYuyv!}K8u1pZe;h{JR=Hiba?*$F(7;A^8zmSfsOAb8b83pgNXatMuvMXjB4g7tF_ zPKm{H&RZDZ58T%b{jWqqRGB)WNns2zQ=LbR#d(B0*qsl8R?H)fW@#N%?t``(JUqXR z0d8>L!x{QJV7crW`aeNML_v=TD~usRDwH%Il&(Y8N+|ItQ-@LwL+M@Orke3!==AG} z_ueh}FT}eFV~Br)^Nt4p0>cv&io?2e184=9qCE661jQ|&+@84~H{v%x8VEc-57p5y z{-R(RyMl&&j=prChYOve30qjRW^ zwHn6S@c@;wY@OhtZ8p*NM+IQw4+>+5->Yc*oeynvBvznJqeHO{r5c9PX+Vs(Y@O}_ zsf9p#O~Hec?DqNIfXIAvntx2 z@u7_uM3vj$p*oaY*Hvvk0a*;h_*@U29+;L8Og~ZZBc4_mLp-H|=}8}$9PXbEHf^~t1u@%JFV+yRq4;97`KTt9DeILdi2WY$n%e}tSo~T2q zwim2!AjY>~(O)(0v+5z39#!xo9#I%WJgkE0kPl3}_pbm`l=u4#LGd)`n2h_0_6+#? z(pAGt*9L&9>fPbR_%M#UUgGXS1#05k3S)?GsknQrGuxJO|OakmPkyL?dMx+=gVUV?NCzw#SHjc=vuD63(V4FXEaf;Hq}Y?v6kU4fOjO<@di ztBSEN_%OzIbsok%1D@~e>QJg-C}n{d&*7f)fHXoN-K=0o+@vsuxKRbt4L%_8C1wSX z1Ybsu;a48C*CDARtcDRb3MeUy)ou@Cdx)`X6j+I?6~+*sS26ZEAIA6ywgO`sAH&w6 zRKrlZ5{U5@tE)U99UzddQm`YgR2V~Cp#o{Y4@i7AT?wSabs*KY+v;i{#=RXZd8`f+ zN4pj1h*5ZwS|#~c2ne8f>TDX-`KNn748DQ{pIf?{ql zN%^5A%4;)!X}mrgV&ErSK%bw5(+RpL4YdYi3F;+6jfZF0#85{RRn$jyu%B>0KTvHHly7F%eD%kQg6+- zeR_MkPj`QPPWRcEh{a<6LgF`5R~+!-IlJJwzPk`)PF8HEn7L&6D}H9u^<tGJX8?n3EMJsCaU`EGvO$IX&iFW!|w1XXW}OcSV-R=lcd1h8IQ_`umH+eb%6r zU}ulF?<2eJ2eOn6tU$7szF<32Ngak)OP5ITh*0VjUf^ zL}E{>U0*ZxwvM#Xd8VcPe_R-SZXdNWJ_^(|LISXlxxILjNkD#iEO*K+R7$cO__@or zU$GoB8&rz6YYz1F59groGA72)&B{{AdO?=6Zec}+z=N!7dRE`{!_E$Oc07hKP==T$d4x+!c)H?_6BQgT#B20B{X%8pK1 zft4X-WpMgtAj{l_zUq4q`p$1<-{eOZIw@{tCu)G~iJZT(6?9jV)s5EAK7`h9m8`?6 z!896N+RBL`#b)jDr&VvX6mt(jeHw}8Zrs1sV>mB^%ybZ}sxcfuZ(B+wx+?axk9L%F z(%(aPFX0iw`v~tR93&hfe1Pyl!lQ%_5gsFa7?5BCM;j(D;uu3t>nN?Ja#*z4U5@1k z=iH*bV9R2cRW+UK=V2_nigHz!i!iCMq+Nc&3IgdVi`(s0%7I;yHNUiXp>A9GT1?ZD zBEGk0qaIvHSu@*#G_H#N%;H4i%5Rd{iHwes|h1jN}oeEKe(geS`hSjRXm>U6thtz<}Fq zC`+PlS7kr8QCV)N?W*h(8%9&xlvgfsO_rkQyY{OG^8|_KYry>)qFe+K|f09 z0VJORgw2Pb(d7$&`TLbO#=Bu}G7$7$LLVU64+y`Ipn=}5(=niwZEj&fN5av4>4I6E zjzI|e5MdaQJO+r2j!tL0j&4LL+Z+oe)PU4pwjS+Z7FWpT=LOC7vEYme2&iu+sb(i}M>2qAth z8q?9dSL5<1J-eXp*SKxm=)ZPtGY5zsb7Sp?8$h^zo$E+Hua5ze&jRMut+(Cfwi&We zD|{~zi*m(r*!l@TCyzZb&ZdelY-**IisN`y|2W|}zE6Jyz-NHO+N+LPs`0DXq!zu`e{IN3UF$cccG@LLz(`8%;4xKzWWFI z?jb@{pYIpYSSQW?gQz@>*Yz2|e)|i~a zpK`24Kdl;SzArjEW1>T)&Uif4U9&qiEruIrsJvh=Ry>OlzPho(9jTsOk0^V8Zyj#6 ziY-Gsaj*K+^pDIyn-0R(UQJItIfvCd)a2oYn4F}?GL_SVCIy{hGA7_cE%Gwm!TZ_IiYooPQpNcjHdR^au{HO5Hu`B+N3gRSvHG{y$; zOU%&TM(%+oH`)Uw?Jcx%zSTEf?C-;SgPe*)+SC+ zk5$t_r07?f^effxn4tGES@J0WW1RX`fS^A!DP04o;v6!99?rz0Uk9k-7m*Rq1C*!E zBXgjY@`io`KPawS$lTaUt}BwyqZjd=8ZWgw5WZa#@I%ZH--1M<3P|EjWXLeZ7eTj5 z&M-BniGI>#$r3;f+(t&wbDDVcWq>MPK}OJXs`=n4BcZ&fp_hU9=3Szze5P@snXGBQ^l;?N^OPW?7o7q8WZtlfd|YGnMcgvWm;GJbmc*74JI zn|SnB0cxPHAtUI_O+5N%0jl_O$OyW36OaCRfGYk1GHIH5rnpAayTL7LC`#tfS<9}6 zFT2by^`D%czbMyiUIO@T3W?w0l;9rTj0F&=f3Sf66!@>#MHyJ$A^`uc(O<;`4TR8W z26o!1B))qp!I`dhD)}N7xUGqP9Y1iB8u}YRQ3-7w5rhOo_~Cuz)%FqIfr+^aY}rQ| zoyab!>VK^6z@zS6EO4s+zpSBu4Puo_Uq?=qLhKPvNHD}om`ckLD&_EME0uWg*rn1A zrqVaCz>&klH&IX})0@Z%8RCy9h6FtcwnS^-k3C=_jdWV zgZcJdEO5wleGg)lZQn;uTn`aU4hGZD7&g5Sb6jTil0D(zq@{SphDtcJu5 z{Z|mGMEW&y;&g~R;#5d5#AKLACnH2+NT`KKX-0yW;^H>2?TL%AmR+vxV6MG`1rC|5 z-$1Og?YGE@@eskpiI8B3XTxk8i?EGxxfZr*jL+>-X$Mp3cUa(t+4?<%Dv|zxoEQ#q zM+}7oLkxzAG!P*YBe|_aVnlbBNIRHFf5HOS|EPHv`6`e8jGTBTL>@>Tr zNQiEE5dTtjbBVD{G*o%1F-mMeG?YaAeLy2hNQlPp;tMh1p(gr3z;C97#7XA6=X~F} z=bn4#%$?aX)A{zRO>ew>W&mr?6*M6zvOt7{xk}b&D<3^3&PyR$^rLM+t!z|U<)CN@ zG*>es$u}Zj7a0+ir$mcbFaHotNic`|^zv1pV@C?u7X(y#24Khw)!8aLt9k)$pxmRh zhm=DC(25~ovLPg=m98@EtcEe_6fCUjs$&CzfYI6qE$T#R2a9ATtKbX8Y%P=WTiU#> z!&FpDPitJQSVK8Y2vot+;sds3Y5q6!9hXceM$$Gi1~eipV>8!s>w1Nw0)|6V4k@h4 ziOPC4KD3URoH!@NZU{F`Yt8Md5@bz>0bJ!)r7ma0=RG!L7V9|M?-gR}nfU*rG-sWx zsLka@x777+5;OTPt0`@u9ZJM&OSLp?jT`Vg%oCu|5OC^UtMfFgrSCMjA$l~J>E?Ig z&SyDU1T5WflCI+SrI|ip#4NwlL?g|w5(xT=_qdyCrd@MGZqcHHw|!@$7Z6K$hHdx~ z&L%SUY97ThEMHo657?dIWpi3w-L!Pebka08S$e{`lLjo^*(?{lW9G`O@=I;}-t(NC zdI{)ctv~Zk<)(`H!ZRgzJEkAl!7xZN*1JTjd{6&YoRcF) ze|d#n(y3lYjGKqpO#lw#d(|L+-1Ckk`aJLAnl=tq2=ww$+{nl2srcRVjv%iXY}_c1 z2&mw);ukvFJ#RIOfzDMeu}8f_u&L!%-)rS6oYJ=A0t(trVnEwP{88CW{8)LC{j{u% zFx#WbIOcGUUABNebry@nqr|;3>Zf%1cg zKT-Wc{S}L4JiG+<-1(3O&RO28nlj*yUlj z$M@kW@;Q%r!ov|9AoCm+h$Vc&%VHlK{K+HNW_Sjt&3g4JHdY#K`Ii|HSLIE!W2xQ$ zxt8g63RA=RLaA@;AnqR>Esm8+4-W6oPaFjFNlXyOP#`{wqP!Wpi#JeVE?W8L*vihi zZ!RwXR{eBNZVzW8L7z{l3y15%W;m=Tg<@mrK=`b9;MO(9f0erfPVlNa!JgTh>&Xp$ zwe7q6MovCFdnr-e`q|kuWsGkd8g)jd_m35(N<+`*OVh&>$&!4r`dZ}gcFJo{*uBS+ imBzTp%6F_6mXe8MVksP1(ft&vTrIm~C9a7|*7qNb%Oj>Fsqz$st(v+8mmQudKQU^t2 zeCZK-r1)2;w?$S|szuQ#e05heq-I-6lb`1xVE}2MEdglMOu$sz+?=?f24stxm$`8- zr=yFm0&t;M1s)KOkCp?41kAEzc{XL)GoFr^)LcEIGr%aJAOvIL3qjV|QQ9fL~<{>XV@YS-mdAv5v_|Y(km0DwV#SN6p z{hGQWEphJ7)$O2;B85%yU}Gq?f5Ggj2akm=MpYuaixp=9PIHckyM`HT3N3allj3M5 zT|20!kZB+!Vxv5ULcrR`tJl?-UUl$)J=h#MB>hP6%$Of+A>)NO7xc_(6sOYaKkUe%3vcn~G<$>Ti8`#}X%}-vD^9SmK|~kBttE94Us| zxzEmO=a}xxqscBcTY67lS0}k=sIRH>Mu*6%JI1$SQO%oueo|TuQn2W@Id6hUPzlrcBtyi?~=)V_pxH!(svqI%6@6ROO zq%6f?vkF7N@^ ziMaW*=Z<1E~Eu8Z(Rc<|)NP!>QBpOqdyIlcML{LUl49(~fP z+kN51AAkAs)U|CiV|e>OXE1o;_)8;W#ew}p#S@2*rSj@d#cJjIY|jVZ;T;LL*P%4UuA+aP$X5mXOmaR*O?>8pV<`BB4DXCK@ z?NH4Pw^}V@t5x3GYOBybG_^0bS%b|j*Jm~8W%(hf-*dST`JprZ_xHS=bDrlp=l{p_ zn4XW*vb-nyzI;k6rM6$J{dR5JXB$ra=us**H|Id>`0saKw>jR; zX@72PF^L`C3tD&E5`O`|js6q*xc%PWj2sF)h8zo=(@i!vxJEZ6HU;Ntl-5vl_*!bS z_NhISzjKswb|FdbcD!Pt{VC+CJVKxZO*`_`9QO?(4EAR1eW?k46?SVo5ejIB_2mbA zcZlgBCWL=*R!q9xm|u|YP)@bM9`IL{1o1iM>)qPConeMuW>}lyJmyM7wOZ`Pt!~<{ z@;&9Fsa4ee_E@a7YfpFklO3Hql#+}_*y%0uKT6D~oyafEo{u)j^Ej4qyR6_H-#RBE z!!PxvMmi-&?X7}9_874@P{kXyx#4r!Wob&ikL7b>-?Nv)hqJF^c^At=SU$De3agSa z*Rd?bveTX^Jnh?>T3OS^rj$&8YO)PQ#U<;Jyfn|HvQ)O(%d;=}i2f_SXfm?5Xo=5H zR54lrMj(AMOJK-eEGmfqg*0^{I(a&%$sEQ+V1$X5G7Di2y#33hkyaL=FMzCU*g=)R zFuWHMnO4-egeQHjja;dpf-5S9;YCkLv#H(3e>Rkn; z-Wr$-rEdDIR-^l1EtDHc63KeRPipD`^cAoU${DSNa(eYp%6JgQASZ8Lfb_r|fe%B< zvg;#_I%-cv0`W(%NG3GJ+yh&n2X2CLp4^XDD&f=U(%EMqSu2`m8lYsvsw#>unb17L z8mbLSgT*g*Y!|xR?cFd6_rSe!GW?i0hJElflnEB@M<0NcVcvip@J+ZA4#ETQBlsfx z0(QY~;34=k?1eYs%kU4#9%cd>{wCpb(5Wf$FhV9Rpg+b1`=GqhuRP(AXX)aC#m;U>G%xLNC*wxt@ zvu;l$adP>Hb~QO=x>5r+wqhtgbWmhD%hX7DW#gfPVy~F?@GY1Y#YY2X-Yvxc_~cu3 zbUOGhUz8eWzA;?1SrfPPc2n`Z)I{QDhE`j8kK@aj<+E2;26ex^wbJuH|5B>G4oBwP KM0L%T^nU=M4>i94 delta 1564 zcmaKsYfPI}7=X{|N} zmmkY=7EKH$CI*R7S{o{PfWaD|^h4D_&Op*17(7Hat(R21>U~ zwq4u!b~X0L`UmarN@vQ(9#+b%tlp4!9-io*&Su7^@~Ph8{B(LO`mmZyHmhDw|G&GA zsv7U_j<(m9)6UDP+;`I6b)dA*9Pq75*{QY#BK&p)UY0%TtH3ej+TcUT+2Eo~sG86L znJ`a;DkMs}_E_{vv6QYtMb_^YAsi_r-I=->uTtODI(H5csDm~g0THd#(F9r4^3=p= zW<0OT!ya{Cc&q<=d;@F?;mQ?4h2CaYnQ%lNQj>vXSX66=Umh*1h#i9|D0c7gZ>RTsI-eI3iwSbDImsGi81{-^_y70{IzR|(C;Rb@2j zUL*EnRPBwcebGg?-7ds=EX%~^)Q#vl_c9g(%l%jk^<3RvD`pu>6_%u0sk`VuQp_yr zJyDd|CJw4heS_aAT*56zct(Ymf5X$qlTh!~@ACUe+kh5;F&KoqU`XApkIb$j73M3g zS)+<_o+T}?B1G%fqR63yn0>NR+3VO>sOzcsSNgS(W0jMQv*!wHy0p!(U@tQ2tnOmOWOyUA^P9^tZ(Zg$pTOoZKUPxB!N{$LKN0p)*ohro%*alBR_E;6^+4Q38N=`#!#TiJ9 zI081rxtz7L1tRY)@9 z6F3QffO&WwPQgFnH2e!b1#dx$F&vUReg>h85uAli_$>6m8R&=4n=RWvmzTQ7K5UGY z)apN+6l0leHa#_bHuK0tZh*VrQAtLzq}rRja!s9S3d%Key6JVBYyL&;(8~-Z{_~?2 z?aoaM=F-{S{dqC6KlRTqpE;NOQ95IeLaGNw3qOzi1?<$kVMy} zvVd}-s2qvY(d{3CLKfZA7cX_vTSd{J_1k?t_=-nbvFgi~kW8r?E#Ck6+_(B{wt1J? Jde3Jie*?03G #include @@ -28,6 +27,8 @@ ConcordiaServer::ConcordiaServer(const std::string & configFilePath) } _indexController = boost::shared_ptr (new IndexController(_concordiasMap)); _searcherController = boost::shared_ptr (new SearcherController(_concordiasMap)); + + _lemmatizerFacade = boost::shared_ptr (new LemmatizerFacade()); } ConcordiaServer::~ConcordiaServer() { @@ -97,8 +98,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { } else if (operation == "lemmatize") { std::string sentence = _getStringParameter(d, "sentence"); std::string languageCode = _getStringParameter(d, "languageCode"); - SocketLemmatizer lemmatizer; - std::string lemmatizedSentence = lemmatizer.lemmatizeSentence(languageCode, sentence); + std::string lemmatizedSentence = _lemmatizerFacade->lemmatizeSentence(languageCode, sentence); jsonWriter.StartObject(); jsonWriter.String("lemmatizedSentence"); jsonWriter.String(lemmatizedSentence.c_str()); diff --git a/concordia-server/concordia_server.hpp b/concordia-server/concordia_server.hpp index 4214694..c0e11c2 100644 --- a/concordia-server/concordia_server.hpp +++ b/concordia-server/concordia_server.hpp @@ -14,6 +14,8 @@ #include "tm_dao.hpp" #include "index_controller.hpp" #include "searcher_controller.hpp" +#include "lemmatizer_facade.hpp" + class ConcordiaServer { public: @@ -48,6 +50,8 @@ private: boost::shared_ptr _searcherController; + boost::shared_ptr _lemmatizerFacade; + }; #endif diff --git a/concordia-server/config.hpp.in b/concordia-server/config.hpp.in index 093b494..dac7ae6 100644 --- a/concordia-server/config.hpp.in +++ b/concordia-server/config.hpp.in @@ -31,3 +31,4 @@ #define CONCORDIA_PHRASE_SEARCH_OP "concordiaPhraseSearch" #define ADD_TM_OP "addTm" +#define LEMMATIZER_DELIMITER "@#@" diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp new file mode 100644 index 0000000..f6adc31 --- /dev/null +++ b/concordia-server/lemmatizer_facade.cpp @@ -0,0 +1,30 @@ +#include "lemmatizer_facade.hpp" + + +LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) { + _lemmatizersMap = boost::ptr_map(); + + // todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator + SocketLemmatizer * socketLemmatizer1 = new SocketLemmatizer(11000); + std::string plCode = "pl"; + std::string enCode = "en"; + std::string hrCode = "hr"; + + _lemmatizersMap.insert(plCode, socketLemmatizer1); + _lemmatizersMap.insert(enCode, socketLemmatizer1); + _lemmatizersMap.insert(hrCode, socketLemmatizer1); +} + +LemmatizerFacade::~LemmatizerFacade() { +} + +std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) { + + boost::ptr_map::iterator it = _lemmatizersMap.find(languageCode); + if (it != _lemmatizersMap.end()) { + return it->second->lemmatizeSentence(languageCode, sentence); + } else { + throw ConcordiaException("lemmatizer for language: "+languageCode+" not found."); + } + +} diff --git a/concordia-server/lemmatizer_facade.hpp b/concordia-server/lemmatizer_facade.hpp new file mode 100644 index 0000000..7eea156 --- /dev/null +++ b/concordia-server/lemmatizer_facade.hpp @@ -0,0 +1,25 @@ +#ifndef LEMMATIZER_FACADE_HDR +#define LEMMATIZER_FACADE_HDR + +#include "socket_lemmatizer.hpp" + +#include +#include +#include + + +class LemmatizerFacade { +public: + /*! Constructor. + */ + LemmatizerFacade() throw(ConcordiaException); + /*! Destructor. + */ + virtual ~LemmatizerFacade(); + + std::string lemmatizeSentence(std::string languageCode, std::string sentence); +private: + boost::ptr_map _lemmatizersMap; +}; + +#endif diff --git a/concordia-server/socket_lemmatizer.cpp b/concordia-server/socket_lemmatizer.cpp index f6170a8..0cd6aee 100644 --- a/concordia-server/socket_lemmatizer.cpp +++ b/concordia-server/socket_lemmatizer.cpp @@ -1,8 +1,10 @@ #include "socket_lemmatizer.hpp" -SocketLemmatizer::SocketLemmatizer() throw(ConcordiaException) : - _sock(-1) { - _connect("127.0.0.1" , 11000); +#include "config.hpp" +#include + +SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) : + _port(port) { } SocketLemmatizer::~SocketLemmatizer() { @@ -11,17 +13,16 @@ SocketLemmatizer::~SocketLemmatizer() { /** Connect to a host on a certain port number */ -bool SocketLemmatizer::_connect(std::string address , int port) -{ - //create socket if it is not already created - if(_sock == -1) { - //Create socket - _sock = socket(AF_INET , SOCK_STREAM , 0); - if (_sock == -1) { - throw ConcordiaException("Could not create socket for the lemmatizer."); - } +bool SocketLemmatizer::_connect() { + + //Create socket + _sock = socket(AF_INET , SOCK_STREAM , 0); + if (_sock == -1) { + throw ConcordiaException("Could not create socket for the lemmatizer."); } + std::string address = "127.0.0.1"; + //setup address structure if(inet_addr(address.c_str()) == -1) { struct hostent *he; @@ -45,16 +46,21 @@ bool SocketLemmatizer::_connect(std::string address , int port) } _server.sin_family = AF_INET; - _server.sin_port = htons(port); + _server.sin_port = htons(_port); //Connect to remote server if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) { - throw ConcordiaException("connect failed. Error"); + throw ConcordiaException("Connect failed. Error on address: "+address+":"+boost::lexical_cast(_port)); } return true; } +bool SocketLemmatizer::_disconnect() { + close(_sock); + _sock = -1; +} + /** Send data to the connected host */ @@ -84,7 +90,9 @@ std::string SocketLemmatizer::_receive(int size=512) } std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) { - _send_data(languageCode+sentence+"@#@"); + _connect(); + _send_data(languageCode+sentence+LEMMATIZER_DELIMITER); std::string reply = _receive(512); - return reply.substr(0,reply.find("@#@")); + _disconnect(); + return reply.substr(0,reply.find(LEMMATIZER_DELIMITER)); } diff --git a/concordia-server/socket_lemmatizer.hpp b/concordia-server/socket_lemmatizer.hpp index 7f20255..4f5e9e9 100644 --- a/concordia-server/socket_lemmatizer.hpp +++ b/concordia-server/socket_lemmatizer.hpp @@ -2,9 +2,10 @@ #define SOCKET_LEMMATIZER_HDR #include -#include //socket -#include //inet_addr -#include //hostent +#include //socket +#include //inet_addr +#include //hostent +#include #include @@ -13,23 +14,26 @@ class SocketLemmatizer { public: /*! Constructor. */ - SocketLemmatizer() throw(ConcordiaException); + explicit SocketLemmatizer(int port) throw(ConcordiaException); /*! Destructor. */ virtual ~SocketLemmatizer(); std::string lemmatizeSentence(std::string languageCode, std::string sentence); private: - bool _connect(std::string, int); + bool _connect(); + + bool _disconnect(); bool _send_data(std::string data); - std::string _receive(int); + std::string _receive(int size); + + int _port; int _sock; struct sockaddr_in _server; - }; #endif