This commit is contained in:
wangobango 2022-01-24 19:22:04 +01:00
parent a1bd9211bc
commit 7a12d96497
15 changed files with 3287 additions and 45 deletions

5
.gitignore vendored
View File

@ -1,3 +1,6 @@
venv/*
dist/*
*.egg-info
*.egg-info
.vscode
__pycache__
*.pt

13
Dockerfile Normal file
View File

@ -0,0 +1,13 @@
# Image for the text-generation HTTP service (app.py).
FROM anibali/pytorch:1.8.1-cuda11.1-ubuntu20.04
WORKDIR /app
# Copy the requirements first so the dependency layers are cached
# independently of source-code changes.
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r requirements.txt
RUN python3 -m pip install gdown
RUN python3 -m spacy download en_core_web_sm
RUN python3 -m pip install transformers
# Download the model checkpoint into /app. The URL is quoted because it
# contains "?", which the shell would otherwise treat as a glob character.
# NOTE(review): app.py loads "model-4.pt" from the working directory --
# confirm the downloaded file is saved under that name.
RUN gdown --fuzzy "https://drive.google.com/file/d/1_zMsEVPOUzQe8yFzRj0-Q08BA8LN4H9x/view?usp=sharing"
COPY *.py /app/
# app.py listens on port 4999.
EXPOSE 4999
ENTRYPOINT ["python3", "app.py"]

38
app.py Normal file
View File

@ -0,0 +1,38 @@
from flask import Flask, request
from utils import POC
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import json
# Module-level service state: the Flask app, the generation helper, and the
# GPT-2 tokenizer/model pair shared by every request.
app = Flask(__name__)
poc = POC()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model_name = "model-4.pt"
is_available = torch.cuda.is_available()
# Without CUDA the checkpoint's tensors are remapped to the CPU; with CUDA
# torch.load keeps its default placement (map_location=None).
model.load_state_dict(
    torch.load(
        model_name,
        map_location=None if is_available else torch.device('cpu'),
    )
)
"""
Example:
{
"data": "asdasdas",
"length": 300
}
"""
@app.route("/generate", methods=['POST'])
def hello_world():
    """Generate text for the prompt supplied in the JSON request body.

    The body must be a JSON object with two fields:
      * "data":   non-empty string used as the generation prompt
      * "length": non-negative integer, the number of sampling steps

    Returns the generated entries joined by a single space, or a plain-text
    message with HTTP status 400 when the body is malformed.
    """
    try:
        data_dict = json.loads(request.data)
    except ValueError:
        # Covers json.JSONDecodeError and undecodable bytes.
        return "Request body must be valid JSON", 400
    if not isinstance(data_dict, dict):
        return "Request body must be a JSON object", 400

    prompt = data_dict.get('data')
    length = data_dict.get('length')
    if not isinstance(prompt, str) or not prompt:
        return 'Field "data" must be a non-empty string', 400
    # bool is a subclass of int, so it is rejected explicitly.
    if isinstance(length, bool) or not isinstance(length, int) or length < 0:
        return 'Field "length" must be a non-negative integer', 400

    x = poc.generate(model, tokenizer, prompt, entry_count=1, entry_length=length)
    print(x)
    return " ".join(x)
if __name__ == "__main__":
    # Listen on every interface, on the port the Dockerfile exposes.
    app.run(port=4999, host='0.0.0.0')

View File

@ -0,0 +1,736 @@
ArticleId,Category
1018,sport
1319,tech
1138,business
459,entertainment
1020,politics
51,sport
2025,tech
1479,business
27,entertainment
397,politics
1644,sport
263,tech
765,business
2134,entertainment
297,politics
1712,sport
1631,tech
942,business
1549,entertainment
516,politics
2215,sport
531,tech
1541,business
1340,entertainment
56,politics
2138,sport
338,tech
133,business
215,entertainment
521,politics
1777,sport
185,tech
2105,business
1556,entertainment
659,politics
1837,sport
59,tech
383,business
1581,entertainment
2167,politics
1593,sport
1849,tech
1691,business
2033,entertainment
2092,politics
849,sport
1780,tech
1747,business
1509,entertainment
1305,politics
2055,sport
2139,tech
57,business
1496,entertainment
671,politics
1193,sport
2104,tech
872,business
606,entertainment
1502,politics
244,sport
381,tech
1944,business
725,entertainment
369,politics
211,sport
1861,tech
1044,business
976,entertainment
1673,politics
442,sport
144,tech
1954,business
48,entertainment
608,politics
2165,sport
913,tech
1580,business
1061,entertainment
529,politics
2137,sport
1284,tech
2040,business
349,entertainment
1385,politics
1307,sport
807,tech
333,business
786,entertainment
1775,politics
558,sport
592,tech
2172,business
1735,entertainment
1188,politics
511,sport
157,tech
646,business
8,entertainment
821,politics
1001,sport
309,tech
1732,business
1291,entertainment
2136,politics
999,sport
1258,tech
721,business
1922,entertainment
954,politics
847,sport
846,tech
313,business
1997,entertainment
322,politics
1221,sport
129,tech
981,business
330,entertainment
1172,politics
2123,sport
1165,tech
1592,business
2094,entertainment
841,politics
1224,sport
72,tech
1176,business
1285,entertainment
1671,politics
67,sport
952,tech
884,business
2011,entertainment
1979,politics
519,sport
1132,tech
1782,business
112,entertainment
1996,politics
683,sport
1447,tech
472,business
562,entertainment
1476,politics
1339,sport
1972,tech
875,business
1067,entertainment
1442,politics
708,sport
515,tech
118,business
1513,entertainment
1227,politics
1196,sport
635,tech
1669,business
1823,entertainment
483,politics
1477,sport
509,tech
667,business
127,entertainment
1316,politics
1983,sport
598,tech
1654,business
1232,entertainment
703,politics
2051,sport
451,tech
1924,business
852,entertainment
1889,politics
1695,sport
417,tech
2125,business
2087,entertainment
1274,politics
579,sport
398,tech
1898,business
87,entertainment
2024,politics
1927,sport
1019,tech
1892,business
357,entertainment
1161,politics
52,sport
1798,tech
2028,business
58,entertainment
258,politics
638,sport
541,tech
2190,business
1936,entertainment
1293,politics
2181,sport
1329,tech
146,business
89,entertainment
1627,politics
1123,sport
1233,tech
580,business
591,entertainment
1933,politics
911,sport
971,tech
1154,business
1622,entertainment
539,politics
1131,sport
1839,tech
545,business
1910,entertainment
799,politics
1016,sport
1460,tech
1286,business
296,entertainment
1345,politics
1568,sport
502,tech
1530,business
81,entertainment
2162,politics
1519,sport
1634,tech
1586,business
469,entertainment
382,politics
353,sport
1666,tech
1457,business
2182,entertainment
1591,politics
1643,sport
1822,tech
742,business
1956,entertainment
1829,politics
1675,sport
376,tech
1626,business
168,entertainment
321,politics
715,sport
1278,tech
653,business
1206,entertainment
411,politics
931,sport
1576,tech
1207,business
1729,entertainment
851,politics
994,sport
223,tech
763,business
977,entertainment
1094,politics
582,sport
1845,tech
965,business
1461,entertainment
945,politics
746,sport
1768,tech
1242,business
259,entertainment
866,politics
1288,sport
915,tech
337,business
1842,entertainment
975,politics
1890,sport
1694,tech
367,business
1270,entertainment
1679,politics
717,sport
838,tech
1857,business
798,entertainment
1915,politics
1051,sport
99,tech
810,business
1662,entertainment
1521,politics
1693,sport
347,tech
60,business
1958,entertainment
723,politics
402,sport
662,tech
3,business
84,entertainment
1551,politics
1684,sport
806,tech
1721,business
446,entertainment
326,politics
2211,sport
2037,tech
1158,business
498,entertainment
2122,politics
1905,sport
564,tech
939,business
1,entertainment
567,politics
1219,sport
1720,tech
1015,business
1459,entertainment
633,politics
1384,sport
242,tech
1334,business
1456,entertainment
1298,politics
1567,sport
205,tech
385,business
795,entertainment
1083,politics
1953,sport
1987,tech
1748,business
1869,entertainment
34,politics
604,sport
142,tech
1832,business
1865,entertainment
862,politics
1191,sport
22,tech
1883,business
737,entertainment
1262,politics
1601,sport
1404,tech
1296,business
2163,entertainment
1168,politics
438,sport
1888,tech
694,business
1663,entertainment
861,politics
704,sport
1010,tech
2039,business
854,entertainment
1562,politics
1422,sport
874,tech
1272,business
1405,entertainment
2049,politics
620,sport
2022,tech
925,business
1573,entertainment
1187,politics
1471,sport
504,tech
576,business
222,entertainment
95,politics
332,sport
1895,tech
2106,business
1181,entertainment
776,politics
9,sport
808,tech
1002,business
784,entertainment
845,politics
1416,sport
1762,tech
525,business
596,entertainment
1800,politics
1341,sport
113,tech
134,business
270,entertainment
1324,politics
12,sport
782,tech
1357,business
1608,entertainment
1134,politics
276,sport
41,tech
1705,business
732,entertainment
968,politics
1006,sport
1140,tech
966,business
1399,entertainment
586,politics
868,sport
1698,tech
452,business
600,entertainment
354,politics
1281,sport
914,tech
1430,business
554,entertainment
1659,politics
1727,sport
640,tech
691,business
1531,entertainment
1389,politics
1545,sport
462,tech
2101,business
1907,entertainment
1373,politics
1787,sport
2202,tech
961,business
680,entertainment
1059,politics
626,sport
1961,tech
1048,business
336,entertainment
719,politics
1563,sport
963,tech
179,business
1438,entertainment
1189,politics
2126,sport
895,tech
2005,business
819,entertainment
23,politics
457,sport
1423,tech
100,business
1448,entertainment
97,politics
172,sport
1918,tech
938,business
752,entertainment
2222,politics
2038,sport
24,tech
1325,business
409,entertainment
624,politics
1367,sport
621,tech
1302,business
505,entertainment
2057,politics
265,sport
262,tech
2176,business
1304,entertainment
1539,politics
700,sport
2124,tech
2203,business
781,entertainment
1558,politics
1419,sport
1311,tech
124,business
664,entertainment
282,politics
1700,sport
1107,tech
922,business
652,entertainment
713,politics
1068,sport
1116,tech
869,business
1703,entertainment
1668,politics
501,sport
2053,tech
908,business
1306,entertainment
348,politics
149,sport
1173,tech
6,business
1180,entertainment
107,politics
1629,sport
131,tech
1046,business
698,entertainment
751,politics
1501,sport
73,tech
1463,business
1856,entertainment
1074,politics
2225,sport
1313,tech
933,business
1136,entertainment
779,politics
1315,sport
1628,tech
432,business
546,entertainment
1779,politics
1514,sport
1365,tech
1776,business
109,entertainment
235,politics
162,sport
2071,tech
412,business
1092,entertainment
1352,politics
294,sport
1973,tech
493,business
45,entertainment
2189,politics
1417,sport
2140,tech
1265,business
1353,entertainment
830,politics
448,sport
535,tech
1656,business
996,entertainment
1246,politics
161,sport
1267,tech
138,business
1575,entertainment
1810,politics
2065,sport
768,tech
1507,business
1520,entertainment
71,politics
1484,sport
163,tech
1171,business
769,entertainment
77,politics
1252,sport
532,tech
1975,business
178,entertainment
1237,politics
1899,sport
676,tech
2008,business
1807,entertainment
481,politics
17,sport
837,tech
523,business
2099,entertainment
1957,politics
1470,sport
1151,tech
489,business
823,entertainment
1743,politics
1035,sport
2103,tech
1230,business
1636,entertainment
103,politics
927,sport
379,tech
1789,business
278,entertainment
887,politics
1300,sport
394,tech
625,business
1770,entertainment
1959,politics
1371,sport
1359,tech
921,business
404,entertainment
422,politics
503,sport
1773,tech
2174,business
1086,entertainment
1986,politics
2007,sport
13,tech
796,business
585,entertainment
1200,politics
642,sport
237,tech
1826,business
43,entertainment
1594,politics
594,sport
1084,tech
1859,business
524,entertainment
463,politics
1085,sport
668,tech
2132,business
1970,entertainment
260,politics
141,sport
468,tech
269,business
414,entertainment
2097,politics
1420,sport
1472,tech
1757,business
243,entertainment
1142,politics
1295,sport
1660,tech
1635,business
2200,entertainment
2146,politics
5,sport
1256,tech
2090,business
1816,entertainment
1991,politics
1310,sport
1162,tech
499,business
285,entertainment
1216,politics
745,sport
421,tech
2068,business
1145,entertainment
681,politics
556,sport
1063,tech
55,business
910,entertainment
2066,politics
1025,sport
2154,tech
64,business
2218,entertainment
2013,politics
1105,sport
669,tech
827,business
1335,entertainment
1426,politics
2196,sport
368,tech
1535,business
1672,entertainment
599,politics
2085,sport
1577,tech
724,business
568,entertainment
2018,politics
1543,sport
308,tech
1356,business
573,entertainment
1746,politics
1290,sport
272,tech
139,business
2191,entertainment
1255,politics
1699,sport
1616,tech
1619,business
1032,entertainment
287,politics
2194,sport
928,tech
584,business
1722,entertainment
1685,politics
1828,sport
692,tech
1393,business
540,entertainment
1317,politics
152,sport
650,tech
191,business
1719,entertainment
1564,politics
350,sport
116,tech
1000,business
36,entertainment
1225,politics
2155,sport
553,tech
551,business
1724,entertainment
1512,politics
1923,sport
373,tech
1704,business
206,entertainment
471,politics
1 ArticleId Category
2 1018 sport
3 1319 tech
4 1138 business
5 459 entertainment
6 1020 politics
7 51 sport
8 2025 tech
9 1479 business
10 27 entertainment
11 397 politics
12 1644 sport
13 263 tech
14 765 business
15 2134 entertainment
16 297 politics
17 1712 sport
18 1631 tech
19 942 business
20 1549 entertainment
21 516 politics
22 2215 sport
23 531 tech
24 1541 business
25 1340 entertainment
26 56 politics
27 2138 sport
28 338 tech
29 133 business
30 215 entertainment
31 521 politics
32 1777 sport
33 185 tech
34 2105 business
35 1556 entertainment
36 659 politics
37 1837 sport
38 59 tech
39 383 business
40 1581 entertainment
41 2167 politics
42 1593 sport
43 1849 tech
44 1691 business
45 2033 entertainment
46 2092 politics
47 849 sport
48 1780 tech
49 1747 business
50 1509 entertainment
51 1305 politics
52 2055 sport
53 2139 tech
54 57 business
55 1496 entertainment
56 671 politics
57 1193 sport
58 2104 tech
59 872 business
60 606 entertainment
61 1502 politics
62 244 sport
63 381 tech
64 1944 business
65 725 entertainment
66 369 politics
67 211 sport
68 1861 tech
69 1044 business
70 976 entertainment
71 1673 politics
72 442 sport
73 144 tech
74 1954 business
75 48 entertainment
76 608 politics
77 2165 sport
78 913 tech
79 1580 business
80 1061 entertainment
81 529 politics
82 2137 sport
83 1284 tech
84 2040 business
85 349 entertainment
86 1385 politics
87 1307 sport
88 807 tech
89 333 business
90 786 entertainment
91 1775 politics
92 558 sport
93 592 tech
94 2172 business
95 1735 entertainment
96 1188 politics
97 511 sport
98 157 tech
99 646 business
100 8 entertainment
101 821 politics
102 1001 sport
103 309 tech
104 1732 business
105 1291 entertainment
106 2136 politics
107 999 sport
108 1258 tech
109 721 business
110 1922 entertainment
111 954 politics
112 847 sport
113 846 tech
114 313 business
115 1997 entertainment
116 322 politics
117 1221 sport
118 129 tech
119 981 business
120 330 entertainment
121 1172 politics
122 2123 sport
123 1165 tech
124 1592 business
125 2094 entertainment
126 841 politics
127 1224 sport
128 72 tech
129 1176 business
130 1285 entertainment
131 1671 politics
132 67 sport
133 952 tech
134 884 business
135 2011 entertainment
136 1979 politics
137 519 sport
138 1132 tech
139 1782 business
140 112 entertainment
141 1996 politics
142 683 sport
143 1447 tech
144 472 business
145 562 entertainment
146 1476 politics
147 1339 sport
148 1972 tech
149 875 business
150 1067 entertainment
151 1442 politics
152 708 sport
153 515 tech
154 118 business
155 1513 entertainment
156 1227 politics
157 1196 sport
158 635 tech
159 1669 business
160 1823 entertainment
161 483 politics
162 1477 sport
163 509 tech
164 667 business
165 127 entertainment
166 1316 politics
167 1983 sport
168 598 tech
169 1654 business
170 1232 entertainment
171 703 politics
172 2051 sport
173 451 tech
174 1924 business
175 852 entertainment
176 1889 politics
177 1695 sport
178 417 tech
179 2125 business
180 2087 entertainment
181 1274 politics
182 579 sport
183 398 tech
184 1898 business
185 87 entertainment
186 2024 politics
187 1927 sport
188 1019 tech
189 1892 business
190 357 entertainment
191 1161 politics
192 52 sport
193 1798 tech
194 2028 business
195 58 entertainment
196 258 politics
197 638 sport
198 541 tech
199 2190 business
200 1936 entertainment
201 1293 politics
202 2181 sport
203 1329 tech
204 146 business
205 89 entertainment
206 1627 politics
207 1123 sport
208 1233 tech
209 580 business
210 591 entertainment
211 1933 politics
212 911 sport
213 971 tech
214 1154 business
215 1622 entertainment
216 539 politics
217 1131 sport
218 1839 tech
219 545 business
220 1910 entertainment
221 799 politics
222 1016 sport
223 1460 tech
224 1286 business
225 296 entertainment
226 1345 politics
227 1568 sport
228 502 tech
229 1530 business
230 81 entertainment
231 2162 politics
232 1519 sport
233 1634 tech
234 1586 business
235 469 entertainment
236 382 politics
237 353 sport
238 1666 tech
239 1457 business
240 2182 entertainment
241 1591 politics
242 1643 sport
243 1822 tech
244 742 business
245 1956 entertainment
246 1829 politics
247 1675 sport
248 376 tech
249 1626 business
250 168 entertainment
251 321 politics
252 715 sport
253 1278 tech
254 653 business
255 1206 entertainment
256 411 politics
257 931 sport
258 1576 tech
259 1207 business
260 1729 entertainment
261 851 politics
262 994 sport
263 223 tech
264 763 business
265 977 entertainment
266 1094 politics
267 582 sport
268 1845 tech
269 965 business
270 1461 entertainment
271 945 politics
272 746 sport
273 1768 tech
274 1242 business
275 259 entertainment
276 866 politics
277 1288 sport
278 915 tech
279 337 business
280 1842 entertainment
281 975 politics
282 1890 sport
283 1694 tech
284 367 business
285 1270 entertainment
286 1679 politics
287 717 sport
288 838 tech
289 1857 business
290 798 entertainment
291 1915 politics
292 1051 sport
293 99 tech
294 810 business
295 1662 entertainment
296 1521 politics
297 1693 sport
298 347 tech
299 60 business
300 1958 entertainment
301 723 politics
302 402 sport
303 662 tech
304 3 business
305 84 entertainment
306 1551 politics
307 1684 sport
308 806 tech
309 1721 business
310 446 entertainment
311 326 politics
312 2211 sport
313 2037 tech
314 1158 business
315 498 entertainment
316 2122 politics
317 1905 sport
318 564 tech
319 939 business
320 1 entertainment
321 567 politics
322 1219 sport
323 1720 tech
324 1015 business
325 1459 entertainment
326 633 politics
327 1384 sport
328 242 tech
329 1334 business
330 1456 entertainment
331 1298 politics
332 1567 sport
333 205 tech
334 385 business
335 795 entertainment
336 1083 politics
337 1953 sport
338 1987 tech
339 1748 business
340 1869 entertainment
341 34 politics
342 604 sport
343 142 tech
344 1832 business
345 1865 entertainment
346 862 politics
347 1191 sport
348 22 tech
349 1883 business
350 737 entertainment
351 1262 politics
352 1601 sport
353 1404 tech
354 1296 business
355 2163 entertainment
356 1168 politics
357 438 sport
358 1888 tech
359 694 business
360 1663 entertainment
361 861 politics
362 704 sport
363 1010 tech
364 2039 business
365 854 entertainment
366 1562 politics
367 1422 sport
368 874 tech
369 1272 business
370 1405 entertainment
371 2049 politics
372 620 sport
373 2022 tech
374 925 business
375 1573 entertainment
376 1187 politics
377 1471 sport
378 504 tech
379 576 business
380 222 entertainment
381 95 politics
382 332 sport
383 1895 tech
384 2106 business
385 1181 entertainment
386 776 politics
387 9 sport
388 808 tech
389 1002 business
390 784 entertainment
391 845 politics
392 1416 sport
393 1762 tech
394 525 business
395 596 entertainment
396 1800 politics
397 1341 sport
398 113 tech
399 134 business
400 270 entertainment
401 1324 politics
402 12 sport
403 782 tech
404 1357 business
405 1608 entertainment
406 1134 politics
407 276 sport
408 41 tech
409 1705 business
410 732 entertainment
411 968 politics
412 1006 sport
413 1140 tech
414 966 business
415 1399 entertainment
416 586 politics
417 868 sport
418 1698 tech
419 452 business
420 600 entertainment
421 354 politics
422 1281 sport
423 914 tech
424 1430 business
425 554 entertainment
426 1659 politics
427 1727 sport
428 640 tech
429 691 business
430 1531 entertainment
431 1389 politics
432 1545 sport
433 462 tech
434 2101 business
435 1907 entertainment
436 1373 politics
437 1787 sport
438 2202 tech
439 961 business
440 680 entertainment
441 1059 politics
442 626 sport
443 1961 tech
444 1048 business
445 336 entertainment
446 719 politics
447 1563 sport
448 963 tech
449 179 business
450 1438 entertainment
451 1189 politics
452 2126 sport
453 895 tech
454 2005 business
455 819 entertainment
456 23 politics
457 457 sport
458 1423 tech
459 100 business
460 1448 entertainment
461 97 politics
462 172 sport
463 1918 tech
464 938 business
465 752 entertainment
466 2222 politics
467 2038 sport
468 24 tech
469 1325 business
470 409 entertainment
471 624 politics
472 1367 sport
473 621 tech
474 1302 business
475 505 entertainment
476 2057 politics
477 265 sport
478 262 tech
479 2176 business
480 1304 entertainment
481 1539 politics
482 700 sport
483 2124 tech
484 2203 business
485 781 entertainment
486 1558 politics
487 1419 sport
488 1311 tech
489 124 business
490 664 entertainment
491 282 politics
492 1700 sport
493 1107 tech
494 922 business
495 652 entertainment
496 713 politics
497 1068 sport
498 1116 tech
499 869 business
500 1703 entertainment
501 1668 politics
502 501 sport
503 2053 tech
504 908 business
505 1306 entertainment
506 348 politics
507 149 sport
508 1173 tech
509 6 business
510 1180 entertainment
511 107 politics
512 1629 sport
513 131 tech
514 1046 business
515 698 entertainment
516 751 politics
517 1501 sport
518 73 tech
519 1463 business
520 1856 entertainment
521 1074 politics
522 2225 sport
523 1313 tech
524 933 business
525 1136 entertainment
526 779 politics
527 1315 sport
528 1628 tech
529 432 business
530 546 entertainment
531 1779 politics
532 1514 sport
533 1365 tech
534 1776 business
535 109 entertainment
536 235 politics
537 162 sport
538 2071 tech
539 412 business
540 1092 entertainment
541 1352 politics
542 294 sport
543 1973 tech
544 493 business
545 45 entertainment
546 2189 politics
547 1417 sport
548 2140 tech
549 1265 business
550 1353 entertainment
551 830 politics
552 448 sport
553 535 tech
554 1656 business
555 996 entertainment
556 1246 politics
557 161 sport
558 1267 tech
559 138 business
560 1575 entertainment
561 1810 politics
562 2065 sport
563 768 tech
564 1507 business
565 1520 entertainment
566 71 politics
567 1484 sport
568 163 tech
569 1171 business
570 769 entertainment
571 77 politics
572 1252 sport
573 532 tech
574 1975 business
575 178 entertainment
576 1237 politics
577 1899 sport
578 676 tech
579 2008 business
580 1807 entertainment
581 481 politics
582 17 sport
583 837 tech
584 523 business
585 2099 entertainment
586 1957 politics
587 1470 sport
588 1151 tech
589 489 business
590 823 entertainment
591 1743 politics
592 1035 sport
593 2103 tech
594 1230 business
595 1636 entertainment
596 103 politics
597 927 sport
598 379 tech
599 1789 business
600 278 entertainment
601 887 politics
602 1300 sport
603 394 tech
604 625 business
605 1770 entertainment
606 1959 politics
607 1371 sport
608 1359 tech
609 921 business
610 404 entertainment
611 422 politics
612 503 sport
613 1773 tech
614 2174 business
615 1086 entertainment
616 1986 politics
617 2007 sport
618 13 tech
619 796 business
620 585 entertainment
621 1200 politics
622 642 sport
623 237 tech
624 1826 business
625 43 entertainment
626 1594 politics
627 594 sport
628 1084 tech
629 1859 business
630 524 entertainment
631 463 politics
632 1085 sport
633 668 tech
634 2132 business
635 1970 entertainment
636 260 politics
637 141 sport
638 468 tech
639 269 business
640 414 entertainment
641 2097 politics
642 1420 sport
643 1472 tech
644 1757 business
645 243 entertainment
646 1142 politics
647 1295 sport
648 1660 tech
649 1635 business
650 2200 entertainment
651 2146 politics
652 5 sport
653 1256 tech
654 2090 business
655 1816 entertainment
656 1991 politics
657 1310 sport
658 1162 tech
659 499 business
660 285 entertainment
661 1216 politics
662 745 sport
663 421 tech
664 2068 business
665 1145 entertainment
666 681 politics
667 556 sport
668 1063 tech
669 55 business
670 910 entertainment
671 2066 politics
672 1025 sport
673 2154 tech
674 64 business
675 2218 entertainment
676 2013 politics
677 1105 sport
678 669 tech
679 827 business
680 1335 entertainment
681 1426 politics
682 2196 sport
683 368 tech
684 1535 business
685 1672 entertainment
686 599 politics
687 2085 sport
688 1577 tech
689 724 business
690 568 entertainment
691 2018 politics
692 1543 sport
693 308 tech
694 1356 business
695 573 entertainment
696 1746 politics
697 1290 sport
698 272 tech
699 139 business
700 2191 entertainment
701 1255 politics
702 1699 sport
703 1616 tech
704 1619 business
705 1032 entertainment
706 287 politics
707 2194 sport
708 928 tech
709 584 business
710 1722 entertainment
711 1685 politics
712 1828 sport
713 692 tech
714 1393 business
715 540 entertainment
716 1317 politics
717 152 sport
718 650 tech
719 191 business
720 1719 entertainment
721 1564 politics
722 350 sport
723 116 tech
724 1000 business
725 36 entertainment
726 1225 politics
727 2155 sport
728 553 tech
729 551 business
730 1724 entertainment
731 1512 politics
732 1923 sport
733 373 tech
734 1704 business
735 206 entertainment
736 471 politics

736
data/BBC_News_Test.csv Normal file

File diff suppressed because one or more lines are too long

1491
data/BBC_News_Train.csv Normal file

File diff suppressed because one or more lines are too long

10
docker-compose.yaml Normal file
View File

@ -0,0 +1,10 @@
# Compose definition for the text-generation service built from ./Dockerfile.
version: "3.9"
services:
  ayct_core:
    build:
      context: .
      dockerfile: Dockerfile
    image: ayct_core:latest
    ports:
      # host:container -- the port app.py listens on
      - "4999:4999"
    container_name: ayct_core

View File

@ -1,6 +0,0 @@
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"

View File

@ -1,10 +0,0 @@
To build project type:
```
pip3 install virtualenv
virtualenv venv
source venv/bin/activate
pip3 install setuptools
python3 -m pip install --upgrade build
python3 -m build
```

6
requirements.txt Normal file
View File

@ -0,0 +1,6 @@
Flask==2.0.2
nltk==3.6.7
numpy==1.22.1
pandas==1.3.5
spacy==3.2.1
tqdm==4.62.3

View File

@ -1,24 +0,0 @@
[metadata]
name = pbrAyctCore
version = 0.0.1
author = Ramon Dyzman
author_email = ramon.dyzman@gmail.com
description = A small example package
long_description = file: readme.md
long_description_content_type = text/markdown
url = https://git.wmi.amu.edu.pl/s415366/pbr-ayct-core
project_urls =
Bug Tracker = https://github.com/pypa/sampleproject/issues
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent
[options]
package_dir =
= src
packages = find:
python_requires = >=3.6
[options.packages.find]
where = src

View File

@ -1,4 +0,0 @@
def getTestString() -> str:
    """Return the package's fixed test string."""
    test_string = "All You Can Tweet"
    return test_string

47
test.py Normal file
View File

@ -0,0 +1,47 @@
import pandas as pd
from nltk import word_tokenize
from utils import Vocabulary
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from utils import POC
from utils import NewsEntry
import torch
# def encode_dataset(data):
# return data.apply(lambda x: vocabulary.sentence_to_numbers(x))
# mode = "train"
# mode = "test"
# Which branch of this script to run: "train", "generate" or "test".
mode = "generate"
prompt = """
Mr Johnson's bakerys is bad. My bakery is good.
"""
model_name = "model-4.pt"
# Rows whose text has this many word tokens or more are dropped.
length_limit = 1024
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
poc = POC()
# Remap checkpoint tensors to the CPU when no GPU is present, as app.py does.
map_location = None if torch.cuda.is_available() else torch.device('cpu')
if mode == "train":
    # The CSV is read only in the mode that needs it, so "generate" works
    # without the data files.
    train = pd.read_csv("data/BBC_News_Train.csv")
    train = train[train["Text"].apply(lambda x: len(word_tokenize(x)) < length_limit)]
    # NOTE(review): the first argument is NewsEntry's control_code, which is
    # interpolated as text into every sample; passing the whole column looks
    # unintended -- confirm before retraining.
    dataset = NewsEntry(train["Text"], train["Text"], truncate=False, gpt2_type="gpt2")
    print("Starting training")
    poc.train(dataset, model, tokenizer, output_dir="./", save_model_on_epoch=True)
elif mode == "generate":
    # dataset = NewsEntry(test["Text"], test["Text"], truncate=False, gpt2_type="gpt2")
    model.load_state_dict(torch.load(model_name, map_location=map_location))
    x = poc.generate(model, tokenizer, prompt, entry_count=1, entry_length=300)
    print(x)
elif mode == "test":
    test = pd.read_csv("data/BBC_News_Test.csv")
    test = test[test["Text"].apply(lambda x: len(word_tokenize(x)) < length_limit)]
    dataset = NewsEntry(test["Text"], test["Text"], truncate=False, gpt2_type="gpt2")
    # Only the dataset and the weights are prepared here; no evaluation is run.
    model.load_state_dict(torch.load(model_name, map_location=map_location))

206
utils.py Normal file
View File

@ -0,0 +1,206 @@
import spacy
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os
nlp = spacy.load("en_core_web_sm")
class NewsEntry(Dataset):
    """Dataset of GPT-2 token-id tensors, one per input text."""

    def __init__(self, control_code, data, truncate=False, gpt2_type="gpt2", max_length=1024):
        """Tokenize every row of ``data`` into a 1-D tensor of token ids.

        Each sample is encoded from
        ``<|{control_code}|>{row[:max_length]}<|endoftext|>``.

        Args:
            control_code: value interpolated into the leading ``<|...|>`` tag.
            data: iterable of strings.
            truncate: when true, keep only the first 20000 rows.
            gpt2_type: name passed to ``GPT2Tokenizer.from_pretrained``.
            max_length: maximum number of *characters* (not tokens) kept
                from each row.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        row_limit = 20000 if truncate else None
        self.data = []
        for row in data:
            # Stop before encoding rows that truncation would discard anyway.
            if row_limit is not None and len(self.data) >= row_limit:
                break
            self.data.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
        self.data_count = len(self.data)

    def __len__(self):
        """Return the number of encoded samples."""
        return self.data_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at index ``item``."""
        return self.data[item]
class Vocabulary():
    """Map lemmas produced by the module-level spaCy pipeline to integer ids."""

    def __init__(self) -> None:
        # Id returned for lemmas that are not in the vocabulary.
        self.__UKNOWN__ = -1
        # lemma -> id, filled by create_vocab().
        self.vocab = {}

    def create_vocab(self, data):
        """Build the vocabulary from an iterable of texts.

        Ids are assigned consecutively from 0 in order of first appearance.
        Any previously built vocabulary is replaced.
        """
        vocab = {}
        for row in data:
            for word in nlp(row):
                # len(vocab) is evaluated before insertion, so a new lemma
                # receives the next free id and a known lemma keeps its id.
                vocab.setdefault(word.lemma_, len(vocab))
        self.vocab = vocab

    def word_to_number(self, word):
        """Return the id of the first token's lemma in ``word``.

        Returns the unknown id when the lemma is not in the vocabulary or
        when ``word`` yields no tokens at all (e.g. an empty string).
        """
        for token in nlp(word):
            return self.vocab.get(token.lemma_, self.__UKNOWN__)
        return self.__UKNOWN__

    def sentence_to_numbers(self, seq):
        """Return one id per token of the text ``seq`` (unknown id if absent)."""
        return [self.vocab.get(word.lemma_, self.__UKNOWN__) for word in nlp(seq)]

    def sequence_to_numbers(self, seq):
        """Return ``word_to_number(x)`` for every element ``x`` of ``seq``."""
        return [self.word_to_number(x) for x in seq]
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a ``(tensor, fits, remainder)`` triple:
      * nothing packed yet      -> ``(new_tensor, True, None)``
      * combined length fits    -> ``(merged, True, None)``, where ``merged``
        is ``new_tensor`` followed by ``packed_tensor`` without its first
        column
      * combined length too big -> ``(packed_tensor, False, new_tensor)``
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor
class POC():
    """Stateless helpers to fine-tune a GPT-2 LM head and sample text from it."""

    def __init__(self) -> None:
        pass

    def train(
        self, dataset, model, tokenizer,
        batch_size=16, epochs=5, lr=2e-5,
        max_seq_len=400, warmup_steps=200,
        gpt2_type="gpt2", output_dir=".", output_prefix="model",
        test_mode=False,save_model_on_epoch=False,
    ):
        """Fine-tune ``model`` on ``dataset`` and return it.

        Samples are drawn one at a time in shuffled order and packed
        together (see ``pack_tensor``) into sequences of at most 768 tokens.
        Each packed sequence gets one forward/backward pass; the optimizer
        steps once every ``batch_size`` passes (gradient accumulation).

        Args:
            dataset: dataset yielding 1-D tensors of token ids.
            model: language model called as ``model(ids, labels=ids)`` whose
                first output is the loss.
            tokenizer: currently unused.
            batch_size: number of backward passes per optimizer step.
            epochs: number of passes over the dataset.
            lr: peak learning rate for AdamW.
            max_seq_len: currently unused; the packing limit is fixed at 768.
            warmup_steps: scheduler warmup, in optimizer steps.
            gpt2_type: currently unused.
            output_dir: directory for per-epoch checkpoints.
            output_prefix: checkpoint file-name prefix.
            test_mode: currently unused.
            save_model_on_epoch: save ``{output_prefix}-{epoch}.pt`` after
                every epoch when true.
        """
        # Fall back to the CPU instead of failing when no GPU is present.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        model.train()

        train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
        last_idx = len(train_dataloader) - 1

        # Packing makes the exact number of backward passes unknown up front,
        # but each pass consumes at least one sample, so this ceil-division
        # is an upper bound on optimizer steps. A non-positive step count
        # would make the schedule multiply the LR by 0 after warmup.
        total_steps = max(1, -(-(len(train_dataloader) * epochs) // batch_size))

        optimizer = AdamW(model.parameters(), lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
        )

        loss = 0
        accumulating_batch_count = 0
        input_tensor = None

        def backward_pass(batch):
            """Accumulate gradients for one packed sequence; step when due."""
            nonlocal loss, accumulating_batch_count
            batch = batch.to(device)
            outputs = model(batch, labels=batch)
            loss = outputs[0]
            loss.backward()

            # Count first, so a step happens after every ``batch_size``
            # accumulated passes rather than right after the very first one.
            accumulating_batch_count += 1
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

        for epoch in range(epochs):
            print(f"Training epoch {epoch}")
            print(loss)
            for idx, entry in tqdm(enumerate(train_dataloader)):
                (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

                # Keep packing until the sequence is full or the data runs out.
                if carry_on and idx != last_idx:
                    continue

                backward_pass(input_tensor)
                # The sample that did not fit starts the next packed sequence
                # instead of being dropped.
                input_tensor = remainder

            # A sample left over from the final iteration still gets trained on.
            if input_tensor is not None:
                backward_pass(input_tensor)
                input_tensor = None

            if save_model_on_epoch:
                torch.save(
                    model.state_dict(),
                    os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
                )
        return model

    def generate(
        self,
        model,
        tokenizer,
        prompt,
        entry_count=10,
        entry_length=30,  # maximum number of tokens sampled per entry
        top_p=0.8,
        temperature=1.,
    ):
        """Sample ``entry_count`` continuations of ``prompt`` with top-p sampling.

        Args:
            model: language model; ``model(ids)[0]`` must be the logits.
            tokenizer: object providing ``encode`` and ``decode``.
            prompt: text every entry starts from.
            entry_count: number of entries to generate.
            entry_length: maximum number of tokens sampled per entry.
            top_p: cumulative-probability cutoff for nucleus filtering.
            temperature: logits divisor; non-positive values are treated as 1.

        Returns:
            A list of decoded strings, each containing the prompt followed by
            the sampled tokens and ending with ``<|endoftext|>`` (sampled, or
            appended when the length limit was reached first).

        Raises:
            ValueError: if ``prompt`` encodes to no tokens.
        """
        model.eval()
        generated_list = []
        filter_value = -float("Inf")

        prompt_ids = tokenizer.encode(prompt)
        if not prompt_ids:
            raise ValueError("prompt must encode to at least one token")
        # Encoded once instead of on every sampling step.
        eos_ids = set(tokenizer.encode("<|endoftext|>"))

        # Build inputs on the model's own device so generation also works
        # for a model that was moved to the GPU (e.g. by train()).
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

        with torch.no_grad():
            for entry_idx in trange(entry_count):
                entry_finished = False
                generated = torch.tensor(prompt_ids, device=device).unsqueeze(0)

                if entry_idx % 100 == 0:
                    print(entry_idx)

                for _ in range(entry_length):
                    # No labels are passed: only the logits are needed.
                    logits = model(generated)[0]
                    logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                    # Drop tokens outside the nucleus; the mask is shifted
                    # right by one so the token that crosses top_p is kept.
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                        ..., :-1
                    ].clone()
                    sorted_indices_to_remove[..., 0] = 0

                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    logits[:, indices_to_remove] = filter_value

                    next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                    generated = torch.cat((generated, next_token), dim=1)

                    if next_token.item() in eos_ids:
                        entry_finished = True
                        break

                # squeeze(0) keeps a 1-D sequence even for a single token, and
                # tolist() works regardless of the tensor's device.
                output_text = tokenizer.decode(generated.squeeze(0).tolist())
                if not entry_finished:
                    output_text = f"{output_text}<|endoftext|>"
                generated_list.append(output_text)

        return generated_list