From 6a24c1a4bc3e5b895432d7c82b3fff06d3fa71ee Mon Sep 17 00:00:00 2001 From: Mateusz Date: Sun, 9 Jun 2024 16:41:14 +0200 Subject: [PATCH] init --- .gitignore | 3 + 464913.docx | Bin 0 -> 17385 bytes 6_Projekt.ipynb | 1576 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1579 insertions(+) create mode 100644 .gitignore create mode 100644 464913.docx create mode 100644 6_Projekt.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..accd1fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +twitter_training.csv +twitter_validation.csv +twitter-entity-sentiment-analysis.zip \ No newline at end of file diff --git a/464913.docx b/464913.docx new file mode 100644 index 0000000000000000000000000000000000000000..665fe82a0a4e05608855f0974357a32ea19e9cb4 GIT binary patch literal 17385 zcmeHv1AAr5wsvgW>Daby+eyc^(P787Z9D1Mww;b`J74y}J^So)zF%OARE;re)RLD120;M;1AqVk03ZNhjI~uW0R#Xj1O)&<27mz460)^%GPZHjQ+BsA zcGRJDv$pz{2LeQ%0|4~Z{{N2u#lJv9!k9%LJ%Y$%;77nr)3RhMd13!ZUbHc`*$ZG8 zv#jvFV1D~Y=Po-#Au$8<$l$lBbmdjTts9W+H?x0xh=hNBJU;E`OY$W;I<4~o)7cPC z%vFB4$QCz8YQMeR%&EvE$dtSjFOjS`kq2@%m(yZC8(V)*2Po`D)-bf!3k5`JKt9A^ zeat7C2pI>*p$?aYO$gc*JrVh11|*c>n~jXnk+ksfF^_e8%r9Vq7>Ch2gpz-N6E*jSVDFnWFVwn4qe z-0Z-;fMFvcZp5;_q}6Cf2-n42b|7+<_1HUfdX9I3${A|G!nC+>qU zA`f2P8a#d-E`7ar9`Ie^9ptoGuxm%Oo?bD)Qe}<0Aw1JW_xX^Jsz>x|iCF6yD8iep z6kF%xR4+6@xyB_tAl3onFqZz-%Tv~Dw~UkN6i{B!mW)W{WUDpuxb<5`V0i6#Ze$Ay z2~-Ur+5W!21o)lp#*7rErya0(OmR0N;LI^qK{LlGBj7ZsRLw&DoJ}M_*4@dF@|3T7 zXwR;ZDu*LCB-k7RBR$SEQ1T(eR0aCTcWi(tsT@Ws3NF95>&f%L9&@WJ5xpvSHU%w# zQaN!>j|SAkV9a=rYs<+4lyEuHW<~zM%8LcZ5h$@opwYGfd_d2L(wU8Z(pjCfn&AEA z7?ry?LNaeobV7QD)eZYh$WPz_X0w@0ywZfaO>7-Vjb5)Rfxs-IntPEQsh z6WY$RrzQjJ6>y;2?tU1?q7hU!EXrwThgy_;%6{#;9o`HvJ9O`b62II>#^XXa-ujdE zeix!OXP97*#SCUIGgk}i!|VxBANV`UfT*zi^8vlB;Uiha?jl(xIW0?iKgr56+`XB4 zP2d5;L|Q%o$9Mmo!PIWkb0wS?k^GYVAIT!&d1y3f@%i(MV})mxD7|1ZEbgdD2~Xfk zLpy|->;-pzso(k$;ePMLOZAE2X%N!-XJpp&!&Z&Y_&$MfAlr;EK-r6u-B-oTrt1&0 zN5=z@ivT{fpJGrpuMd38yOf=eEkPp%gR8OdGtE{J1;xp@KW& 
z0XlHf8lF_N9vuMK@DPm~I&7P5*>Sh=rpn0tM326I^T;)3@|Nf1&(^Uv_;k4C8O$Ui z!9Mqs8F_&vbK|h*_a&cZi~_S&@5QPznaqC4~7L5!6+HExDa8j zh1U1f-#+i?heWhj^5bZ003)lbj}BpjhVbo#B(3XE9s5VaoNqlFJV zr$ypv;fLAb+}i|nZYDd!KJJX4n2uB0I}DOKlsZq!$5+vQE`q7NB<0+O^<3p=#69`x zlaF32&7xW_8esCHD|kIJTp=g*Cb8@0Amqd=-`d=o6|Z}x3M=h3V>OzzF;bP~NqC?~ z14hLkzkBX^KQ{~`QwtHhO4Nw?;o9l(qnhm>!0{)<+x! zYhcA0e)A1>*z5Wn*^eUVnH|`4fR4+t7)r$MH%HYhv%m({H9Y(HBtAp07_^M9BhkXZ zsocRj0d(8qtqG%&rapa^cX-M)boG^}C%P7sosRi9Mvw;m5N1WT;M{}Ex1IQ@B_XInJeUPj}t)ANNX00p@W!>Flb)09X zFtm%(u*z9<`XZ1RWv-pL^iDd3Q_quD?@-SdqCIWVkzB9Zz>ch-t*_GDVC}^4=d^aS z)-MTj)uHlOce6M0D6!aewQSQ8Fd`Tag+Q>B1iYYIy#%SY;~Jc01eJUQL#%>YsnPi* z_0)$-JMjU>6`-ULt(D@MxX3TSrM8a)_aD7z^~nsm4u|RJ(6(krVSI+cS`JgydR=D1 zdUNv5eHP3Y*j3-u$>w`V@Nn)NUdO+4siaWJFumCl+TM|DjJurpac9-d7C@igl8Zve z$&G$%IVP|dGnNzKqkjj(&)9aG0xc{PfFw?CRcQmUht5=$F7OS5 zANq)R>SA~5T$)u=gY0(I!W?$pkj86}GVB12BFZ4$JmiX> zJ7m6!4ro-uc)05>{)3sQI0Jifs6quvj)T)b6@%&;{pjHW*p=zAxB^IM4nuRfxFbS7 zbXD!?S#jBBcRP`C*KRyg_7&GAP^y|h=h zG-OsSD)(8uiO;Azd0HJ|odNYn#vwNP)oO@-@q?%~N-(+Xvehwyz-l+k6@us!I`{xa zf+)`3dvDZnEy-yN*!@JEP9@lNID1la+MSZ$y%Ev7i?C`G=d#=l&8!dw<1Q@6B#ucv zxRR`)#0()@q14ayqYEOf%uV>LUm;f1q)xJl2%3+Vb1a_+k!`u(uIpl`1kBH|*2?Q0 zTs1CFO>R-lrg+Y8Ad-ntxZ-&-azWdI4Ot1tTBC7NmYf9GQ(3A3R}J(ik|vc~DjC)$ z5JhHcs9#f{w>OfngQ52(2w+vpW6puDSbqm=oLsBjG-KQGb7VM5<*cVElJl)#+cAq= z8MO-8$9#;Kj+sN(=1EJH6KQh!pX?^!Q<87j3oUE;Ug9KWmKeSm7tR^p_=J1{0XcM-~Y^ASO};sEXRApdOYG*__Y zHLSKndkhb12$%C=j-B$SdVO$wsDxN#yp2$Ku!m90iM^&YYfoTsOE7Vt5M%Jt7U)r7 zPmTsU3dqeJJ!6*zmTi{}=at&MHwYgY`8WXLYGF}4PzAI9o$FuNZU}1xtArjnG`I|yMLw(0E?gzdypW^h)i=;gLF z7q*!vEft2{(cf?D3@{96k{bL*9<{PfxP9xLeepD~N`%g9o3o;TdGZQjNQBCY>2&bm zMBF=;oc~Vca!mTEis;PPxdl@#T{T8v<+ZvnamfhWB_?Lg!i9p{Z4{35Z>u^xV$ zQ>*L@#T?k!yNDqpwgvzV(T&$aa2{x?Xwqn}T!-DMky@$17)KXK<_m=;p+J{tW zCf2Gfm1=N9onLN}97fkoUr^y{cX)G4B9=*rSY0K*vIr^KZ2C?wtRZ1ovVypG(t5|x zk0|3vPrtXiZ3=PYXH{hW5wmlKQKZw4S)%&8=47FiAVjFg;?&i8pwy-b57ZgOu*4>V zANBjix78Ol6f!YfmytzHOHFhUoq8o{bN;lB2k2H?xAtqu$f5;c$W(df2bD;4>ft$+ 
z2UMNf$Um7%%M?_KzoqxgYcH8C47p@(tG4`B%9b3I`^syDTA&a|VKt1qglYU}L?#?b9V|d~#GpY%bcF>h>4qaE*7NxYIwA2RUgd#FuUa zRI}?uI>qNmT<2hhs~K0S(6(IeiE0dRuJ~4(%wM4NtQ(iijzmEp#Zg#q36xsVB)Uf= z1ktY6Ai_%^m?<-WaPGB#pw(}lIfv3(c@eN{IU|* zK@P2?1-OMlUI*^KX2)lwkRf>AuifDPR~ii=g?V0ottoh50RW(WwfGy2oXm`^jp_bY zGW;z#qtO$!#){B^e#8gS%9Ug-(S&{t)?`IyT{lA5CJ{?SGlOO*$xLqH8u=}G4?65& zQ^-#@tN}DFFWBmpX$?)rbC@onfI7UvoY=aFAI=guiQS9ID+3db;`Al59z=^Q#w4VG zr_Fw%mL?^LSn#;Ql#d;6iL)t@Ax0D+9_8VTWc&>K72n4R_pBC!bb9B-yb0XHWqjG7 zT6ap27Hoy3&X?jc9Vrc9Fx^Qtnw)*#1zFpcGfGbmK+>8D6q3!8fB9x!)0e}ay7@w; znEdC1H^(rQz_Q^F$|kC z^|;Tqr>{$=mL%gYN0c6h&l<;-D0%KmCo?NXl}D{vr;keyBmni+>m@ulVjrtnAinf_ z$h5zXSl1@Hb@Wl??Ja9~tl+g^VXK#It+!vZo-EuTDQz}r-rR7Md6oGGDw_iz!8Us2 zo9P4wD!T%0x!!Zv7O#*{3=<}sr=t#={T-SthZDB6MBSDght3b6baPK6<4|8H<`k>s z5W5){kgigu87`A+L17Bfv}m_bep+J4og^5wA2VnA?Q+p}CFx;`*MR%wsf6@eNfDc= zYlzSI-=E2~sA&1MbNE~zyosGoJj0_;>FRR7KJPr=hMWGzMzQsCDx&JRX);f4E(fcW=y;G;0^(l{_>rM z06eATX7LrHUhH;F1^@=O0dN-MDQJgF@nyRm!Wzp@Ie@|RZQd0Ih7-ub>t$vY7?6j$ zW0x~Ms5P(Jxd65^z~WqnI3U-%@)9MD7l2_&3Qie9AOAiYUU5X^2mxymjWE0J;VUS~ z$Y{JsWCRPq=&^?((=cC}V3YeZ*w~zKIKT4wU@Mp9+p>{2gq#!c)DK4>fr^%E(qT;T zhzJ3Hl|9^U1LTo>5?GdC(}EkuN%nx4_JEA=pML-(A$>$&?t1TW>COXSj)L352Onkao+9>C4oOH8#zT%dAF}8fbvWSCX zR-xOO(u>a#Q;b-SxmMyE_lDhs2hHUQEb9?K?zRfOyh7BxVLkrQi!nswb&1CsCN$!M zk5Ck>t7y3Fa&Y3x;@FWlBk@!Ksw23zG>5N9R0ZB`9{rl>$(5;IgtU+l8$LpUn9^i3 zj4@`(z0EN(Ol>G=I>6yk^j-0R?G<^w$+@8YJ{VbA^gHI@^bx`tQ16Vb$#};)lk>$J z%+l3VzjS2_Sdb)JkBJkQBLqO$29j}3ZBP{>v+M>~BV7(ZLmEp+up0r3J;pfqEvxqJ znF!SH&M5eZZq|%?&W-cDfVibRU4~VoKIjsS1(;YtG_<>o)EGWN7{qBJkg`rKFzht# zEt%I0qVUOEs&2R4Wp(Gj^&O)GSp^qZ+;sMa)IRd`Cs!*kEO{qVrBU!jXHY*Uw_u>#y786~OAR(^<)G^5NEZx1KaDz2zH|;f=*}GG)jcOSEYzV7c zrQ0C>yf~Irz-_GuEulzugQJ~1dSfh=U)o_sF*&kWd(=$R&vV;v8`V~M(OvDV^=?0b z3&+`LvjuM<#AgOS)YpCLG$dzzD(t`EO=b74P<+7VpL6gMmGk>}an+`N#dYm{a`>E2 z=Q(Yvy*=q7vHjumzXtD#&?n?RK)%e3uK@o41n(VH=){* z2;1F3>i#rCidUw1ifeTWy2c}~nnA6Ll*Md@F~M+nHfG0}yC*alCbTi28Pri!zM(qLP8&D)E&jk% zyB%TR!`bNtSEi5#$t?P*l#qnTdjsqpd7~EVW@0Q^`8&8yJgiU!*Y6+CD(+5=!>Zc69G`TH>yC9 
z_))IGk_%7t-9r^N%u(FJR~6TDCp4$Q8t@RTNE=LW+xb{MwacTf(C;`SJ)?eaKjSeZ zK#woNj{12@SoWBe-b1}`p-LTD!bIuV=C-caXu9GeQ(GwaJY+ldU~BSAaS789(fP>D z^1C-X$B2rdaB>rYcE+3Qg&T&dA-tt|_1Wuq*|dD?E^+zbBYaW-h*7(G1BKeZSl&Vv z%?o7ih-bPdRV6XEfiO(aSpZcYZQ~$XqcPd?<=@}ncSjV3r<}^Br`$65LDoex|?t(D!$alGP_g5~Y{mwUS z5>aCNod&*XIm9IgCZ=>t&{QTif-K9mp4pJ`KKGq8gx^g-OJ=)VHc*aXfRGjR;^QHP zm5jUI{{1?ngE&8{(AR#*d4!Fh+u>D5grI|ISO|NK={zLf;W9Q?q%zpvChktY1l_Ya(bMrR1Lk_;+zLpk80>52OIW@KtLK+|TSzpM;0jhcL5IMQskl{6Prs;SZ zSV(3`zA^0Fy@RB5fTgq{qdC2?sZ1Y=Jz%x<>84vbE9ee~5D->UM|w$|g|l!iA-;O4 zjzzy;WE@g6;_`T}d5!kdePpq&dK-J5S;uqz>}=o{aa%`CVXAL25HII^pUL4}NzKKAa9t@+{*yNoyjU#w!vGL}&+s1KxBflh2MKD8f6Z-OXq2fg;ir=$!1>o9V zSXZWbw2)rWSQ##Tu*x{+Any%*V(^IGKLsGsg}a-zg~FhCMq5S{7B(!CkHodEh1Coc zTii##k&S)Wr13JSs(k8DI~@c5YEk5V6wca&6pXvUqLCu9HlsWz7lktC9(@6g`sBN6B4U8SWhRZ)j%kktjn_X6f zpiAgGT#ya2`H?17b<>GC*`jbQv64dCZ9h$1h7+rwFm55n&Uv@8`$clrFf|HN*7r{?r>hGZ(?> zmtCG+QoHJ{C4ZD30~#HD`!Kd62kfR=i}t{s2q{MXdRi6lMs_w*Y2eku|mh z)eXw(B!bD}`Q76dEuhpb3jxv183fNH9}>FXSX4cw5v9dRiX?9Ev+TAxm#oo;$qDZ# ztS2@11uh|7konw88Nuqp#UmpP2V#*TEJZKB7=@E;copYWDYN>x*6@QCkQImJxl14) z{lZ6S!^YB@9RM6;*9zC<$#9piI(yWc=DC`#3_7*Uv&TnkkLN5FhZ04a!)IMt&>Te7f#g% zkug4 zvfezs=D|VmDWQbZ`xIOnk50{&Li1uGSM@BlD7oizaWp|x-O=s%Xz-8Cv~^VNMnNCa zO+R_gp=Ve3UK8_H=v*RK8dn#u{2S4>H-^^eqjo(4t;ICQ$#fFoFw#eytN-vjZiLWO`jY2Ljjl?MGB;XsVc~$E?bhkWg+Ra z06$`ChiMtmOsa}TEeu5GnfI=bC@SP-$Qij7OqQ=GKf(@`qTMN{cXU|jb-gjDHg{*s z&7Hf}G<}ZT#aa|Q?t+xT&zSqaLjy(1{`wa6c^MH-%ec!Q96tGrvx}B)H_y%h(&scH zUC(TAqCK?m?VQoZJ{FTrOS)p=Q(Bp+-5~=nAGns}) zd( zU1;5sVg$V=@CG4A@RrIuV#ufzYdFF+t~q&^e`iNAlb#A~hl4^No8na=r=g8{XUwQ0 zxTn#rP*$5w{V<#24p+s5H_r+eC9@xqf{?Nt--GZ)LNnn$4ByE#i>QhfguET;d`alX z?q-0q0|1zd>DN>GKnIe%3=XU!Lzlo{!RFF|n?G>(7Gc}(SHGn5^w@+f4 zZFQvJYq)^C?OpboBxJ=JL%QV~m?T5Wj)H?Lqd~6&i?n8tz%R2dce}h8tQa{CDe~kq zahTJ#lM^FeOV5<(tF=*oUNXC|cul8t6A>&LB?$Kq>t{&4kTM%L%Hm}IkpJ=AnMC{T z)c;fvCFf1ca42U;gcoteSe2BcHksDO?MXaegTN&tN?F5TH~O?K;#!)xZf*u#rD&Nw zAF|CrL0D-w_&84F?l7*1&jHK2>C>e&E_M^)#+i~AKTBE#rQiK-pQL~)b+vypK|I?l 
z5v=y@KFu0R?clTYJMwub>njNfV;6Y5_0KY1DEn_T4*8AeT}`6}iq_z3=5?gEc}CdnUf?0E|7sHhV7s{Zc{#X|`FrD6+guB_ ztLuZy?uG24n+?YTR8`>ktU=8vS_V1=otgY}@-}lERFdZ1%l3#sw23a(F`$wOD-QyE zQK0aT-?#*l1sA*S1WTTPdLYmbHIrt%zjl~^Y_zK5vxr3NAsGBC?*@qrS(3*+P z#k{~+`N+2*!OS`{J|d)3-E2jQ%nds}stD6b@ewmhqeBSF=r81?cMnDhFY3zW=gD^jJ&#SK{Rj$u?DdfXWYVu*ce5J;bnRsy*`^?&i zqgxAW`KzL-K5ps6ydYR}klX7nKB!t(jTw2YZm6d~3UBBwRrDjIo^aQ=A~lXrp7xmJ znQp{F!l7tU5?1jgI;1@}G4{U2Vq}2P<`IxStI?hwNfcD1kO^`<~lH}An!#98_ zcZI8dY$bfiw;?&s&*E&TtWXiVmy^;)(=0f6ie=DS`r!m(Ja@uSb5g0)WCrEy4;j6~ z45L{5RUOeW%P^>uP7?QCDZ)>;6j+MQOJc*DUomtF8=iic!LYaYZ}3=f za?X10Tl-)69?rQ@d}XCobq~SVyfS86L2`%*Yy(%{c9(<J0BX#~tu$Rk-0{SGaYqoiZ4qO!X0P z47{oh653A6)9Yn@oi<-5uRMJ<9ENvb6d<;D>K{d=EaWXSDE;@&o$=GF%VX+jz*|&P zr~|4=lyS8Tij-=0t~Z?N0YPd1cUT6nS+#LYaUWnA&1s_nrnn9dLGRFG;A=bIhJp=a zF9KWEZF6_TX7B=TQC!aqt7LXP*$;`s%`#FA-qZc<5b{vNNgiCSH5m5MI`{KcS|(?* zp)Jbw0phR6C&pagFk(ZsI97kt6mfo1ZgIzX=H7>wBtoyXDE2Bd?&MQ!-1yx#<(#&r za;kXYo!Z)@nc3j9pp3jd)xcxVBg=aZMQaK}@BwuKwkBxZUKl;k3u@cCM!FT|kZd_%BXj zYmQ1AhXoSE66EaPw5^v{49CFutp%5X@`uuZZza_v>_raqs5eC<>U+4(BCY0-CYfS7 zD@zR^eOn4@vns6>w33K@i1ZhdOhMQE2AheMu#U?EIvlI?KY7*@Q(=XcdvuAB&I z95$PHFIME{US1UWkD$Z?>k>iF$O6Fbg++*i3yW||3syzk z%-Om9nlHtV~+f=CR{%GFaD%)S_|z zo4mF!c@~a;$UFF_sJMlH!V6{9zK|||6#syK;Tl{1vmeK*DfIBszG)=HR6ssc;cbM^ zb>I5%WuJ&9%qDlqut2@jJpwHDpIWY8s76$6Uy-}$gtJG;a0IoG~W-+-XByPqdg+R6%h zc8z!gElBPYA!{{_$(J^%4QNwKs9tP<*xzH*%<)!M=vh99AD@(y>siO^{X8_Q>a^Z; zV@;OBM&_}zE4Ub)rQ@G-IN0CKu*uqK{k9R^8Zy~D6wAlCiAUz@J+rp=F$lefm6VP? 
zJTh^354|Ty@mZz|o;)OmuM8y5r&_x0B&=*8Ut7RBR;~ha^j1e~r7PgNd;`_$kbX_d zA$9h%*P3`&W)wPSV2A+>qC7&{Vvl@jLs)C-A|u}nU?VqX6TG}cga!5fR-iVxOYAWGIU*NPf^ z@A4`iUG}kAHm_z#sZmYEU6s=~Gf5=9};R#TYZQ*Ecn(WSL} z%Yw3V&67-ngN6|!tmsw*5_QDwK!Gv#_EH1OXGz8a;vmVfnOLY4S_y3``^Vssoi*99HDi#1jOR)^aCDAB}872kCHs|-8NDM>IU z+&=3b`9khhB(PS1QAtCjc>U0zp3<`IWxWgJl3eU>7x|99;20|P;w0xFQF9HLfPHW@ z+oHq;MS;tP@P>1e{zk3FqsTW;FVauzL%mcK)|JPsk)^$?<8X}Tc5a3A_mT3MN>*?< zR1GGCGw;*t`Ywy+W-8&yL{_<;5xf&Q7rM zJfs&BrLElYF!4>Cs(CJrEb|$=YnpA2O)N$+o8%kXAQ`vRd8wT0^0GH7r8m|rt=m3X zQS%Ji8l;uKU3FxuPPX;v05hyk_x<9X8{O;}O$>0tjuz-|e{YPBIZJQ6TaTrkD199N zcehW(bZ)}xYx4*BSJ(&TpKhP4v4PT`!sKTu*0$;N2)2zlFZ`FcWr`-&)USYeGjjpt`X z6NN;A)|4|TJe5CSD?xQwv7l@s!o@~tDk=@74xRQe>b1fq34R`Okf%+iBIqX zk`4kKh6bQ{>Z2EJG8CSyn?l^DhvTsmUfF%iWa8m(!Tb5`dK13O?G|q=p{XKh`++5_ zou>hmPX7Cm$`to|C zI>p6=4eV+u9hNMFgd>(*>pN&d`l-!jmP9kAsO~A&8I#^>2O4fevw_OH=nwD-seM#+ z=o9pg+ugEx-8wneMr@eeHu~rfUOMKAEv^;-$g1p>)m}6L&n(z_ZSGsMl*$V+-hSp2 z@bj_jT%LGfj#07Rt|tAMVb>4D_;WsS0aRk-L@A_ZINHT?9PVia@x($#XBk8V z+20>WB%}w~s@SAI`QWQE6s5^G(adqQ#mF~q=kN&P=8P(aX0%2Ob87`d#k_%EB!w)0 zU3f=IK%Otr;Ws~Ye6k(#{nr(oxh+n@;4gN*L;fi!Y4Wv$P+8x=%J`3< z?@9e*D-ek5nd2Y#0=;Ye3c|=TAdKh0fGAJv>!(zx?fl6t|C6y}uR)TeSe%=9Bx6jD(lQ@mBi&aEFe<$B`kY&Qrxbde(FJ19pD1P(N- znw_~#llTLxP0@EK`r2{_M~*g3F_;%0-kL^nYi=QJawPr*v_i)v{9eo_lbOhO#P#5h z<$GzNjBlkO`Y5n{gg8W^b3&+z{sg+pM#{SdVYaaar1yD}GqB2g7U#gl-0Or$-E6ze zNUW=Oazx7xq{Xp0i(rFyP=K(A@Ck_f>+zYlBPN?B{q5_qzrm4#p91Bv8nA1LIyQ%K z_i#2VRpmsJ5HRI}!h>Rur9{sRQGt=gt5q`mN5eSN*fKVU^rL46B5?Rh)F*Z zbN(L4`Q+2~3WWdlysJ2MmM}P@U!0>I&eot{^R4m$Ui66WyC=+ua?GX=T`%+rmnjnPQM$xtV?tl&wXW=-k@1Tg%sk*!4?! 
z3s80J&Ngh+zxCxSV*Z~wPRSkkbFp89e*K~@;#ZE-S7MXAgRPw-ouRG6-&p h*WK>qe2KhEgxk;^{R zmUTRFixq5{#&;QDP}HiO6We`nM~^5*z#@oq0%Iksi}!{sl62$xiI(EjMEerXy5q}l zcBBZbXd?mwnb2@3uOyhwv!S_2bMb`m&eyzCw*l+&)1m)L4&WFgB<+DNrXD;zi(=ObaT?&6+)>2JG(yy-~XfW7l*%&45c2MHF}s1_+>tfR<^o$0xe*Xff$1$eJ$XA>!~zpvR)&J znB|<6rRX~JsTGehxy#tQIb?~@^yEo2v5*nBW|M^qBT)8p?A-^;2YH{Cm;SJyqb#Ey z;Fd1_{Wg2At_IjYd_%w#1(0YfR{$9**yUaTG9{TLxkej!U7$h-c^<7Tnb$qu*ZqWe z>BdNc(Lt8@9Z^LESjnrRo>XTBPtWLh7K`~k6z8Ww%Mtb---?4Yo z>w04ER@-1#AJ^#}I?1s5^O|Dh_t&HSyHWrG(R^9Q|NTD={>Fem760WQ4&zV-*a#O1^$|)(Eb7c|77C+RnlM6FaIS= z9pnF#jQLj)f6dPMmk2Mse=a}$$qD%@{IBcP|ALni{R94QtJi3VPk6F_-u{19 zzy1pTtJm}|a1P5qz<=?f{wm?GmhHbJ@Us6y!oS(Mf5rd1iSaKq0Kh#b0Kk739DjxX j`^f%hcn8m);Qtus<)uKs=mh`(^YsG!V(2XYAD{jo`?9$t literal 0 HcmV?d00001 diff --git a/6_Projekt.ipynb b/6_Projekt.ipynb new file mode 100644 index 0000000..96d86ae --- /dev/null +++ b/6_Projekt.ipynb @@ -0,0 +1,1576 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "5K66MlDpZDnE" + }, + "source": [ + "## Analiza sentymentu w opiniach z Twitter'a\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OaqaYfQFZDnH" + }, + "source": [ + "### Download dataset and prepare data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "23k83t7RNCJa" + }, + "source": [ + "#### Installation of packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "I5pSpk6PNCJb", + "outputId": "3f30ecd9-104a-496a-fd52-447f4d64e814" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.0.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", + "Requirement already 
satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n", + "Requirement already satisfied: emoji in /usr/local/lib/python3.10/dist-packages (2.12.1)\n", + "Requirement already satisfied: typing-extensions>=4.7.0 in /usr/local/lib/python3.10/dist-packages (from emoji) (4.12.1)\n", + "Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (4.3.2)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.25.2)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.11.4)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim) (6.4.0)\n" + ] + } + ], + "source": [ + "%pip install pandas\n", + "%pip install scikit-learn\n", + "%pip install emoji\n", + "%pip install gensim" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FA_aZGAkNCJd" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "id": "yQvOCaX2NCJd" + }, 
+ "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import emoji\n", + "from gensim.utils import simple_preprocess" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gp8ITdbPNCJe" + }, + "source": [ + "#### Download the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DlcNiu4UNCJe", + "outputId": "015b3ad1-6b9d-4845-dd98-0b0c085b12c9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis\n", + "License(s): CC0-1.0\n", + "twitter-entity-sentiment-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)\n" + ] + } + ], + "source": [ + "!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yU4XFDrUNCJf" + }, + "source": [ + "#### Unzip the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G2gaml-MNCJf", + "outputId": "e327c071-a0cd-480f-92d3-66388fd4dfcb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Archive: twitter-entity-sentiment-analysis.zip\n", + " inflating: twitter_training.csv \n", + " inflating: twitter_validation.csv \n" + ] + } + ], + "source": [ + "!unzip -o twitter-entity-sentiment-analysis.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bBO6YHwyNCJg" + }, + "source": [ + "#### Load the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "id": "9KlnXJTtNCJg" + }, + "outputs": [], + "source": [ + "cols = [\"tweetid\", \"entity\", \"sentiment\", \"content\"]\n", + "twitter_training = pd.read_csv(\"twitter_training.csv\", 
names=cols)\n", + "twitter_validation = pd.read_csv(\"twitter_validation.csv\", names=cols)\n", + "dataset = pd.concat([twitter_training, twitter_validation])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XIslo9YQNCJg" + }, + "source": [ + "#### Info about the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rnh5-0SZNCJh", + "outputId": "99319b5c-f4e2-4aee-e963-13e8d2e938ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Index: 75682 entries, 0 to 999\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 tweetid 75682 non-null int64 \n", + " 1 entity 75682 non-null object\n", + " 2 sentiment 75682 non-null object\n", + " 3 content 74996 non-null object\n", + "dtypes: int64(1), object(3)\n", + "memory usage: 2.9+ MB\n" + ] + } + ], + "source": [ + "dataset.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rpHNMU57NCJh", + "outputId": "576fba81-c5fc-47ee-aae9-4f1734081e97" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(75682, 4)" + ] + }, + "metadata": {}, + "execution_count": 104 + } + ], + "source": [ + "dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eylMuu0GNCJj", + "outputId": "d04a8e0a-42ac-4f70-f277-5b9300e97016" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment\n", + "Negative 22808\n", + "Positive 21109\n", + "Neutral 18603\n", + "Irrelevant 13162\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 105 + } + ], + "source": [ + "dataset[\"sentiment\"].value_counts()" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 106, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fm7H57JINCJj", + "outputId": "6af989a7-c3e7-4666-afef-c2859265d027" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tweetid 0\n", + "entity 0\n", + "sentiment 0\n", + "content 686\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 106 + } + ], + "source": [ + "dataset.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AF_ZNH6pNCJk", + "outputId": "f3191e1e-1176-4c08-9c3e-31eee38020d8" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "3217" + ] + }, + "metadata": {}, + "execution_count": 107 + } + ], + "source": [ + "dataset.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LeIs8ceHNCJl" + }, + "source": [ + "#### Prepare the dataset\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GJfxQkbWNCJl" + }, + "source": [ + "##### Drop tweetid and entity columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "id": "X3GwAqSQNCJl" + }, + "outputs": [], + "source": [ + "dataset = dataset.drop(columns=[\"tweetid\", \"entity\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WpBDzbx6NCJm" + }, + "source": [ + "##### Drop null values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "id": "ixlmP6cwNCJm" + }, + "outputs": [], + "source": [ + "dataset.dropna(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z_0UNES2NCJm" + }, + "source": [ + "##### Remove emojis\n" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "id": "I9Mr8rAQNCJm" + }, + "outputs": [], + "source": [ + "dataset[\"content\"] = dataset[\"content\"].apply(\n", + " lambda 
x: emoji.replace_emoji(x, replace=\"\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wLptSqrzNCJm" + }, + "source": [ + "##### Simple Preprocess\n" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "id": "gw8HC9XBNCJm" + }, + "outputs": [], + "source": [ + "dataset[\"content\"] = dataset[\"content\"].apply(lambda x: \" \".join(simple_preprocess(x)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vayfwdnkNCJm" + }, + "source": [ + "##### Drop null values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "id": "6r1_Hk1JNCJn" + }, + "outputs": [], + "source": [ + "dataset.dropna(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k2aDiHxrNCJn" + }, + "source": [ + "##### Drop duplicates\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "id": "56YaoLvjNCJn" + }, + "outputs": [], + "source": [ + "dataset.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "exgWXEmNNCJn" + }, + "source": [ + "#### Info about the dataset after cleaning\n" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1oaFMRqANCJn", + "outputId": "05560ac6-9dd5-4397-8730-59239af28fc6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Index: 65839 entries, 0 to 991\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentiment 65839 non-null object\n", + " 1 content 65839 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 1.5+ MB\n" + ] + } + ], + "source": [ + "dataset.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "92f8IAAINCJo", + "outputId": 
"383826e9-6f8b-4e66-c7f9-2efacb8a5c96" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(65839, 2)" + ] + }, + "metadata": {}, + "execution_count": 115 + } + ], + "source": [ + "dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "F7a05XcCNCJo", + "outputId": "4b189aa3-8df7-44be-aa20-9066d4cde04a" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment\n", + "Negative 20147\n", + "Positive 17868\n", + "Neutral 16193\n", + "Irrelevant 11631\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 116 + } + ], + "source": [ + "dataset[\"sentiment\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GG3Qgk44NCJo", + "outputId": "4959a695-513d-47e0-cbe9-b05d149478cf" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment 0\n", + "content 0\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 117 + } + ], + "source": [ + "dataset.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u5g9cVa1NCJo", + "outputId": "214e37d0-71b0-4616-fb37-2b47e365ee14" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0" + ] + }, + "metadata": {}, + "execution_count": 118 + } + ], + "source": [ + "dataset.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eVZuSi-SNCJo" + }, + "source": [ + "#### Split the dataset into training and testing sets\n" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "id": "BCy_q1GHNCJp" + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = 
train_test_split(\n", + " dataset[\"content\"], dataset[\"sentiment\"], test_size=0.2, random_state=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qSKofcjNCJt", + "outputId": "d2e09e82-4174-4e87-a0be-5e9158668733" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((52671,), (13168,), (52671,), (13168,))" + ] + }, + "metadata": {}, + "execution_count": 120 + } + ], + "source": [ + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UmMEl5AYNCJt" + }, + "source": [ + "### TF-IDF - Logistic Regression\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2_scDhXqZDnJ" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "id": "ugm_fVSiZDnK" + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X8DY5eStNCJu" + }, + "source": [ + "#### Text Vectorization Using TF-IDF\n" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "id": "IBAy8zjcNCJu" + }, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "X_train_tfidf = vectorizer.fit_transform(X_train)\n", + "X_test_tfidf = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rS5pptZINCJu" + }, + "source": [ + "#### Training a Logistic Regression model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "m3tmiTWVNCJu", + "outputId": "e9372b78-9ea9-4a4a-8289-f3e9ee6ba511" + }, + "outputs": [ + { +
"output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression(max_iter=1000)" + ], + "text/html": [ + "
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 123 + } + ], + "source": [ + "model = LogisticRegression(solver=\"lbfgs\", penalty=\"l2\", max_iter=1000)\n", + "model.fit(X_train_tfidf, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GiY_P6PKNCJu" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "id": "CJ_9qh6ONCJu" + }, + "outputs": [], + "source": [ + "y_pred = model.predict(X_test_tfidf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "koeb78PsNCJu" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hsABx8mJNCJv", + "outputId": "c4c23ca6-c88a-4db9-fe66-36a7febd3594" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " Irrelevant 0.82 0.70 0.75 2304\n", + " Negative 0.80 0.86 0.83 4024\n", + " Neutral 0.79 0.74 0.77 3169\n", + " Positive 0.78 0.82 0.80 3671\n", + "\n", + " accuracy 0.79 13168\n", + " macro avg 0.80 0.78 0.79 13168\n", + "weighted avg 0.79 0.79 0.79 13168\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y17ccTy1NCJv" + }, + "source": [ + "### TF-IDF - Random Forest Classifier\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yyk_baF-NCJv" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": { + "id": "-xjXLHpQNCJv" + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tl6mOx92NCJw" + }, +
"source": [ + "#### Text Vectorization Using TF-IDF\n" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": { + "id": "bE9h15BcNCJw" + }, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "X_train_tfidf = vectorizer.fit_transform(X_train)\n", + "X_test_tfidf = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cNUGJWXINCJw" + }, + "source": [ + "#### Training a Random Forest Classifier model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "WTrPtycbNCJw", + "outputId": "e97b690c-f698-414a-cc40-4843d12e2073" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RandomForestClassifier()" + ], + "text/html": [ + "
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 128 + } + ], + "source": [ + "model = RandomForestClassifier(criterion=\"gini\")\n", + "model.fit(X_train_tfidf, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HPlAbp8PNCJx" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": { + "id": "0ePAr1uZNCJx" + }, + "outputs": [], + "source": [ + "y_pred = model.predict(X_test_tfidf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oPnSaSB-NCJx" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gqRJRLcKNCJx", + "outputId": "b4b1bbfb-5b76-4936-cb74-e200dc72e1c6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " Irrelevant 0.95 0.87 0.91 2304\n", + " Negative 0.92 0.95 0.93 4024\n", + " Neutral 0.94 0.91 0.93 3169\n", + " Positive 0.90 0.94 0.92 3671\n", + "\n", + " accuracy 0.93 13168\n", + " macro avg 0.93 0.92 0.92 13168\n", + "weighted avg 0.93 0.93 0.92 13168\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "18jz3yhuNCJy" + }, + "source": [ + "### Word2Vec - LSTM\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0gizZeVCNCJy" + }, + "source": [ + "#### Installation of packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sy0x-OwPNCJy", + "outputId": "9815f9df-920a-48c0-f8c3-c174b2544ee4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: tensorflow in /usr/local/lib/python3.10/dist-packages (2.15.0)\n", + "Requirement already satisfied: absl-py>=1.0.0 
in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.4.0)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.6.3)\n", + "Requirement already satisfied: flatbuffers>=23.5.26 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.3.25)\n", + "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.5.4)\n", + "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\n", + "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.9.0)\n", + "Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (18.1.1)\n", + "Requirement already satisfied: ml-dtypes~=0.2.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\n", + "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.25.2)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.3.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.0)\n", + "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.20.3)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow) (67.7.2)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.16.0)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.4.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from 
tensorflow) (4.12.1)\n", + "Requirement already satisfied: wrapt<1.15,>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.14.1)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.37.0)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.64.1)\n", + "Requirement already satisfied: tensorboard<2.16,>=2.15 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.2)\n", + "Requirement already satisfied: tensorflow-estimator<2.16,>=2.15.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.0)\n", + "Requirement already satisfied: keras<2.16,>=2.15.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.0)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (2.27.0)\n", + "Requirement already satisfied: google-auth-oauthlib<2,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (1.2.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (3.6)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (2.31.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (0.7.2)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (3.0.3)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in 
/usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (5.3.3)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (0.4.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow) (1.3.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (2024.6.2)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.16,>=2.15->tensorflow) (2.1.5)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (0.6.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow) (3.2.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages 
(1.25.2)\n" + ] + } + ], + "source": [ + "%pip install tensorflow\n", + "%pip install numpy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nyh33SHPNCJy" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "id": "WGINcl6pNCJy" + }, + "outputs": [], + "source": [ + "from gensim.models import Word2Vec\n", + "from gensim.utils import simple_preprocess\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from sklearn.preprocessing import LabelEncoder" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JrQ66Il0NCJy" + }, + "source": [ + "#### Function to convert text to Word2Vec vectors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "id": "3MEhNRL0NCJz" + }, + "outputs": [], + "source": [ + "def text_to_vector(text, word2vec, vector_size):\n", + "    \"\"\"Return the mean Word2Vec vector of the in-vocabulary tokens of `text`.\n", + "\n", + "    The text is tokenized with `simple_preprocess`; tokens missing from\n", + "    `word2vec.wv` are skipped. If no token is known, a zero vector of length\n", + "    `vector_size` is returned.\n", + "    \"\"\"\n", + "    words = simple_preprocess(text)\n", + "    text_vector = np.zeros(vector_size)\n", + "    word_count = 0\n", + "    for word in words:\n", + "        if word in word2vec.wv:\n", + "            text_vector += word2vec.wv[word]\n", + "            word_count += 1\n", + "    if word_count > 0:\n", + "        text_vector /= word_count\n", + "    return text_vector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JH_t6_ZtNCJz" + }, + "source": [ + "#### Tokenize texts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "id": "KVzBCEbWNCJz" + }, + "outputs": [], + "source": [ + "# Tokenize exactly as text_to_vector does, so every token it looks up was\n", + "# tokenized the same way when the Word2Vec vocabulary was built.\n", + "tokenized_text = dataset[\"content\"].apply(simple_preprocess)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o_fLSc_uNCJz" + }, + "source": [ + "#### Vector size parameter\n" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "id": "sLY4J1nTNCJ0" + }, + "outputs": [], + "source": [ + "vector_size = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XIMHHrRqNCJ0" + }, + "source": [ + "#### Train Word2Vec model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { +
"id": "UysosPtiNCJ1" + }, + "outputs": [], + "source": [ + "model_word2vec = Word2Vec(\n", + " tokenized_text, window=5, min_count=2, workers=4, vector_size=vector_size, epochs=20\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xufmoLDlNCJ1" + }, + "source": [ + "#### Convert texts to Word2Vec vectors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "id": "QrY3vXcXNCJ1" + }, + "outputs": [], + "source": [ + "train_vectors = np.array(\n", + " [text_to_vector(text, model_word2vec, vector_size) for text in X_train]\n", + ")\n", + "\n", + "test_vectors = np.array(\n", + " [text_to_vector(text, model_word2vec, vector_size) for text in X_test]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-3hER130NCJ2" + }, + "source": [ + "#### Find the maximum sequence length in the training set\n" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": { + "id": "gcxbUr4lNCJ2" + }, + "outputs": [], + "source": [ + "max_len = max(len(seq) for seq in train_vectors)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ezJ1OadFNCJ3" + }, + "source": [ + "#### Pad sequences to the same length\n" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": { + "id": "1oGWbRZtNCJ3" + }, + "outputs": [], + "source": [ + "X_train_emb = tf.keras.preprocessing.sequence.pad_sequences(\n", + " train_vectors, maxlen=max_len, dtype=\"float32\", padding=\"post\"\n", + ")\n", + "X_test_emb = tf.keras.preprocessing.sequence.pad_sequences(\n", + " test_vectors, maxlen=max_len, dtype=\"float32\", padding=\"post\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LaBGGtm3NCJ4" + }, + "source": [ + "#### Encode labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": { + "id": "Xe96PgKtNCJ4" + }, + "outputs": [], + "source": [ + "label_encoder = LabelEncoder()\n", + "y_train_enc = 
label_encoder.fit_transform(y_train)\n", + "y_test_enc = label_encoder.transform(y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P_fI-cZHNCJ4" + }, + "source": [ + "#### Define LSTM model\n" + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "sEUnVQJEP-hy" + } + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": { + "id": "pF5HZSRKNCJ4" + }, + "outputs": [], + "source": [ + "# Each text is already one averaged Word2Vec vector of float features, so it is\n", + "# passed to the LSTM as a one-step sequence. An Embedding layer would instead\n", + "# treat these floats as integer token ids and discard the vectors.\n", + "n_features = X_train_emb.shape[1]\n", + "model = tf.keras.Sequential(\n", + "    [\n", + "        tf.keras.Input(shape=(n_features,)),\n", + "        tf.keras.layers.Reshape((1, n_features)),\n", + "        tf.keras.layers.LSTM(128),\n", + "        tf.keras.layers.Dense(64, activation=\"relu\"),\n", + "        tf.keras.layers.Dense(32, activation=\"relu\"),\n", + "        tf.keras.layers.Dense(4, activation=\"softmax\"),\n", + "    ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YNk8m5lnNCJ4" + }, + "source": [ + "#### Compile the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": { + "id": "IqDLE1FuNCJ5" + }, + "outputs": [], + "source": [ + "model.compile(\n", + " optimizer=tf.optimizers.Adam(learning_rate=1e-3),\n", + " loss=\"sparse_categorical_crossentropy\",\n", + " metrics=[\"accuracy\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cIRnkLT0NCJ5" + }, + "source": [ + "#### Train the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QFpStkX9NCJ5", + "outputId": "af188e61-04a4-45e7-e6f3-ef50cba478da" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/50\n", + "823/823 [==============================] - 10s 9ms/step - loss: 1.3439 - accuracy: 0.3438\n", + "Epoch 2/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.3261 - accuracy: 0.3678\n", + "Epoch 3/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 1.3163 - accuracy: 0.3774\n", +
"Epoch 4/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.3020 - accuracy: 0.3975\n", + "Epoch 5/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2904 - accuracy: 0.4119\n", + "Epoch 6/50\n", + "823/823 [==============================] - 8s 9ms/step - loss: 1.2814 - accuracy: 0.4186\n", + "Epoch 7/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2741 - accuracy: 0.4262\n", + "Epoch 8/50\n", + "823/823 [==============================] - 8s 9ms/step - loss: 1.2667 - accuracy: 0.4325\n", + "Epoch 9/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2588 - accuracy: 0.4372\n", + "Epoch 10/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2513 - accuracy: 0.4407\n", + "Epoch 11/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2451 - accuracy: 0.4450\n", + "Epoch 12/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.2365 - accuracy: 0.4491\n", + "Epoch 13/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2291 - accuracy: 0.4560\n", + "Epoch 14/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2218 - accuracy: 0.4593\n", + "Epoch 15/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2144 - accuracy: 0.4636\n", + "Epoch 16/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2066 - accuracy: 0.4669\n", + "Epoch 17/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1989 - accuracy: 0.4707\n", + "Epoch 18/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1887 - accuracy: 0.4759\n", + "Epoch 19/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1810 - accuracy: 0.4803\n", + "Epoch 20/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1717 - accuracy: 0.4846\n", + "Epoch 21/50\n", + "823/823 
[==============================] - 6s 7ms/step - loss: 1.1631 - accuracy: 0.4883\n", + "Epoch 22/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.1533 - accuracy: 0.4948\n", + "Epoch 23/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1426 - accuracy: 0.4983\n", + "Epoch 24/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1338 - accuracy: 0.5040\n", + "Epoch 25/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1229 - accuracy: 0.5075\n", + "Epoch 26/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.1126 - accuracy: 0.5125\n", + "Epoch 27/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1042 - accuracy: 0.5167\n", + "Epoch 28/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.0920 - accuracy: 0.5237\n", + "Epoch 29/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0809 - accuracy: 0.5266\n", + "Epoch 30/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.0730 - accuracy: 0.5307\n", + "Epoch 31/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0628 - accuracy: 0.5357\n", + "Epoch 32/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0536 - accuracy: 0.5422\n", + "Epoch 33/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0399 - accuracy: 0.5480\n", + "Epoch 34/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0350 - accuracy: 0.5503\n", + "Epoch 35/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0237 - accuracy: 0.5553\n", + "Epoch 36/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0217 - accuracy: 0.5550\n", + "Epoch 37/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0073 - accuracy: 0.5633\n", + "Epoch 38/50\n", + "823/823 [==============================] - 7s 
8ms/step - loss: 0.9927 - accuracy: 0.5703\n", + "Epoch 39/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 0.9848 - accuracy: 0.5732\n", + "Epoch 40/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 0.9786 - accuracy: 0.5748\n", + "Epoch 41/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9735 - accuracy: 0.5774\n", + "Epoch 42/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9633 - accuracy: 0.5839\n", + "Epoch 43/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9530 - accuracy: 0.5873\n", + "Epoch 44/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9506 - accuracy: 0.5893\n", + "Epoch 45/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 0.9364 - accuracy: 0.5958\n", + "Epoch 46/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9260 - accuracy: 0.6006\n", + "Epoch 47/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 0.9257 - accuracy: 0.6008\n", + "Epoch 48/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9155 - accuracy: 0.6048\n", + "Epoch 49/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9103 - accuracy: 0.6066\n", + "Epoch 50/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.8999 - accuracy: 0.6122\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 155 + } + ], + "source": [ + "model.fit(X_train_emb, y_train_enc, epochs=50, batch_size=64)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CAoGoGZ7NCJ5" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LCtJlNP9NCJ5", + "outputId": "15942b31-270a-4817-9b55-f5a48663dbed" + }, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "412/412 [==============================] - 2s 4ms/step\n" + ] + } + ], + "source": [ + "y_pred = model.predict(X_test_emb)\n", + "\n", + "# Class index with the highest predicted probability for each sample.\n", + "y_preds_argmax = y_pred.argmax(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ee3GIUHJNCJ6" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MMCmZDLgNCJ6", + "outputId": "67ad7bb2-386c-432f-dd95-0c3105a13f0c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.32 0.20 0.25 2304\n", + " 1 0.46 0.62 0.53 4024\n", + " 2 0.44 0.43 0.44 3169\n", + " 3 0.45 0.39 0.42 3671\n", + "\n", + " accuracy 0.44 13168\n", + " macro avg 0.42 0.41 0.41 13168\n", + "weighted avg 0.43 0.44 0.42 13168\n", + "\n" + ] + } + ], + "source": [ + "# Report per-class metrics under the original label names rather than the\n", + "# integer codes produced by the label encoder.\n", + "print(\n", + "    classification_report(\n", + "        y_test_enc, y_preds_argmax, target_names=label_encoder.classes_\n", + "    )\n", + ")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file