This commit is contained in:
jakubknczny 2021-05-30 22:43:03 +02:00
parent 3fea4b5ee5
commit 812e8b75db
18 changed files with 98 additions and 5276 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@
*.o *.o
.DS_Store .DS_Store
.token .token
.idea

View File

@ -2,7 +2,7 @@
<project version="4"> <project version="4">
<component name="ProjectModuleManager"> <component name="ProjectModuleManager">
<modules> <modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" /> <module fileurl="file://$PROJECT_DIR$/.idea/log_reg_um.iml" filepath="$PROJECT_DIR$/.idea/log_reg_um.iml" />
</modules> </modules>
</component> </component>
</project> </project>

View File

@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,7 @@
:
.gitignore,a/5/a5cc2925ca8258af241be7e5b0381edf30266302
:
log_reg.py,8/d/8defb5199d336fa9e58cbe5b8c4833d0e222ccf9

View File

@ -1,13 +0,0 @@
Skeptic vs paranormal subreddits
================================
Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
,Glitch-in-the-Matrix, conspiracytheories).
Output label is the probability of a paranormal subreddit.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

View File

@ -1 +0,0 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

View File

@ -16,7 +16,7 @@
1 1
0 0
0 0
0 1
1 1
1 1
0 0
@ -95,7 +95,7 @@
0 0
0 0
1 1
0 1
1 1
0 0
0 0
@ -123,7 +123,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -189,7 +189,7 @@
0 0
0 0
1 1
1 0
0 0
0 0
1 1
@ -321,7 +321,7 @@
0 0
1 1
0 0
0 1
1 1
0 0
0 0
@ -367,7 +367,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -425,7 +425,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
1 1
@ -436,7 +436,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
1 1
@ -497,7 +497,6 @@
0 0
0 0
0 0
0
1 1
0 0
0 0
@ -508,6 +507,7 @@
0 0
0 0
0 0
0
1 1
0 0
0 0
@ -529,7 +529,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
0 0
@ -639,8 +639,8 @@
0 0
1 1
0 0
1 0
1 0
1 1
0 0
1 1
@ -652,7 +652,7 @@
1 1
0 0
1 1
0 1
1 1
1 1
0 0
@ -691,11 +691,11 @@
0 0
0 0
1 1
1
0 0
0 0
0 0
1 1
1
0 0
0 0
1 1
@ -782,7 +782,7 @@
1 1
0 0
1 1
0 1
0 0
1 1
0 0
@ -846,7 +846,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
1 1
@ -1056,7 +1056,7 @@
0 0
1 1
0 0
1 0
0 0
0 0
0 0
@ -1067,7 +1067,7 @@
0 0
0 0
1 1
1 0
0 0
0 0
0 0
@ -1110,7 +1110,7 @@
0 0
0 0
0 0
1 0
0 0
0 0
1 1
@ -1311,13 +1311,13 @@
0 0
0 0
0 0
0
1 1
0 0
0 0
0 0
0 0
0 0
0
1 1
1 1
1 1
@ -1366,7 +1366,7 @@
0 0
0 0
0 0
0 1
1 1
0 0
0 0
@ -1395,7 +1395,7 @@
0 0
0 0
1 1
1 0
0 0
0 0
1 1
@ -1416,7 +1416,7 @@
0 0
0 0
0 0
1 0
1 1
0 0
0 0
@ -1435,7 +1435,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
1 1
@ -1476,7 +1476,7 @@
0 0
0 0
0 0
1 0
1 1
0 0
0 0
@ -1504,7 +1504,7 @@
0 0
0 0
0 0
0 1
1 1
0 0
0 0
@ -1524,7 +1524,7 @@
0 0
1 1
0 0
0 1
0 0
0 0
1 1
@ -1739,7 +1739,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
1 1
@ -1820,7 +1820,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -1848,7 +1848,7 @@
1 1
0 0
0 0
0 1
1 1
0 0
0 0
@ -1922,7 +1922,7 @@
1 1
0 0
0 0
0 1
0 0
0 0
1 1
@ -1995,7 +1995,7 @@
0 0
0 0
1 1
0 1
1 1
0 0
0 0
@ -2116,7 +2116,7 @@
0 0
0 0
1 1
1 0
1 1
0 0
0 0
@ -2182,12 +2182,12 @@
0 0
1 1
0 0
1
0 0
0 0
0 0
0 0
0 1
0
0 0
0 0
0 0
@ -2295,7 +2295,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -2520,7 +2520,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
0 0
@ -2584,7 +2584,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
1 1
@ -2660,13 +2660,13 @@
0 0
0 0
0 0
0 1
0 0
0 0
1 1
1 1
0 0
0 1
1 1
0 0
0 0
@ -2821,7 +2821,7 @@
0 0
0 0
0 0
1 0
1 1
0 0
0 0
@ -2893,7 +2893,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
1 1
@ -2920,7 +2920,7 @@
0 0
1 1
0 0
1 0
0 0
1 1
1 1
@ -2961,14 +2961,14 @@
0 0
0 0
0 0
1
0 0
0 0
0 0
0 0
0 0
1
0 0
1
1
0 0
0 0
0 0
@ -2997,13 +2997,13 @@
1 1
1 1
0 0
0 1
1 1
0 0
0 0
0 0
1 1
0 1
0 0
0 0
0 0
@ -3092,7 +3092,7 @@
0 0
1 1
0 0
0 1
0 0
0 0
0 0
@ -3135,7 +3135,7 @@
0 0
0 0
1 1
1 0
1 1
0 0
0 0
@ -3179,7 +3179,7 @@
0 0
0 0
1 1
0 1
1 1
1 1
1 1
@ -3197,7 +3197,7 @@
0 0
0 0
0 0
0 1
0 0
1 1
0 0
@ -3259,7 +3259,7 @@
0 0
0 0
0 0
0 1
1 1
1 1
0 0
@ -3356,7 +3356,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -3400,14 +3400,14 @@
0 0
0 0
0 0
1
0 0
0 0
1
0 0
1
0 0
0 0
0 0
1
0 0
1 1
0 0
@ -3494,7 +3494,7 @@
0 0
1 1
0 0
0 1
0 0
0 0
0 0
@ -3632,7 +3632,7 @@
0 0
0 0
0 0
1 0
1 1
0 0
0 0
@ -3710,7 +3710,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
0 0
@ -3754,7 +3754,7 @@
0 0
1 1
0 0
1 0
0 0
1 1
0 0
@ -3822,7 +3822,7 @@
0 0
1 1
0 0
0 1
0 0
1 1
1 1
@ -3879,7 +3879,7 @@
1 1
0 0
0 0
0 1
0 0
0 0
1 1
@ -3910,7 +3910,7 @@
0 0
1 1
0 0
1 0
0 0
0 0
0 0
@ -3966,7 +3966,7 @@
0 0
1 1
0 0
0 1
0 0
0 0
1 1
@ -4011,7 +4011,7 @@
1 1
0 0
0 0
0 1
0 0
0 0
1 1
@ -4208,7 +4208,7 @@
0 0
1 1
1 1
0 1
1 1
1 1
0 0
@ -4238,7 +4238,7 @@
0 0
0 0
0 0
1 0
0 0
1 1
1 1
@ -4277,7 +4277,7 @@
1 1
0 0
0 0
1 0
1 1
0 0
0 0
@ -4459,14 +4459,14 @@
1 1
0 0
0 0
1 0
1 1
0 0
0 0
0 0
1 1
0 0
0 1
0 0
0 0
0 0
@ -4495,11 +4495,11 @@
0 0
0 0
0 0
1
0 0
0 0
0 0
0 1
0
0 0
0 0
0 0
@ -4527,7 +4527,7 @@
0 0
1 1
1 1
0 1
0 0
1 1
0 0
@ -4537,7 +4537,7 @@
1 1
0 0
0 0
0 1
0 0
0 0
1 1
@ -4743,7 +4743,7 @@
0 0
0 0
0 0
1 0
0 0
0 0
0 0
@ -4895,9 +4895,9 @@
0 0
0 0
0 0
0
1 1
1 1
0
1 1
0 0
0 0
@ -4909,7 +4909,7 @@
0 0
0 0
1 1
0 1
0 0
0 0
0 0
@ -5057,7 +5057,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
0 0
@ -5135,7 +5135,7 @@
0 0
0 0
0 0
1 0
1 1
0 0
0 0
@ -5162,7 +5162,7 @@
0 0
0 0
0 0
0 1
0 0
0 0
0 0
@ -5235,7 +5235,7 @@
1 1
1 1
0 0
0 1
1 1
1 1
1 1
1 0
16 1
17 0
18 0
19 0 1
20 1
21 1
22 0
95 0
96 0
97 1
98 0 1
99 1
100 0
101 0
123 0
124 0
125 0
126 0 1
127 0
128 0
129 0
189 0
190 0
191 1
192 1 0
193 0
194 0
195 1
321 0
322 1
323 0
324 0 1
325 1
326 0
327 0
367 0
368 0
369 0
370 0 1
371 0
372 0
373 0
425 1
426 0
427 0
428 0 1
429 0
430 1
431 1
436 0
437 0
438 0
439 0 1
440 0
441 0
442 1
497 0
498 0
499 0
0
500 1
501 0
502 0
507 0
508 0
509 0
510 0
511 1
512 0
513 0
529 0
530 0
531 1
532 0 1
533 0
534 0
535 0
639 0
640 1
641 0
642 1 0
643 1 0
644 1
645 0
646 1
652 1
653 0
654 1
655 0 1
656 1
657 1
658 0
691 0
692 0
693 1
1
694 0
695 0
696 0
697 1
698 1
699 0
700 0
701 1
782 1
783 0
784 1
785 0 1
786 0
787 1
788 0
846 1
847 0
848 0
849 0 1
850 0
851 1
852 1
1056 0
1057 1
1058 0
1059 1 0
1060 0
1061 0
1062 0
1067 0
1068 0
1069 1
1070 1 0
1071 0
1072 0
1073 0
1110 0
1111 0
1112 0
1113 1 0
1114 0
1115 0
1116 1
1311 0
1312 0
1313 0
0
1314 1
1315 0
1316 0
1317 0
1318 0
1319 0
1320 0
1321 1
1322 1
1323 1
1366 0
1367 0
1368 0
1369 0 1
1370 1
1371 0
1372 0
1395 0
1396 0
1397 1
1398 1 0
1399 0
1400 0
1401 1
1416 0
1417 0
1418 0
1419 1 0
1420 1
1421 0
1422 0
1435 0
1436 0
1437 1
1438 0 1
1439 0
1440 0
1441 1
1476 0
1477 0
1478 0
1479 1 0
1480 1
1481 0
1482 0
1504 0
1505 0
1506 0
1507 0 1
1508 1
1509 0
1510 0
1524 0
1525 1
1526 0
1527 0 1
1528 0
1529 0
1530 1
1739 1
1740 0
1741 0
1742 0 1
1743 0
1744 1
1745 1
1820 0
1821 0
1822 0
1823 0 1
1824 0
1825 0
1826 0
1848 1
1849 0
1850 0
1851 0 1
1852 1
1853 0
1854 0
1922 1
1923 0
1924 0
1925 0 1
1926 0
1927 0
1928 1
1995 0
1996 0
1997 1
1998 0 1
1999 1
2000 0
2001 0
2116 0
2117 0
2118 1
2119 1 0
2120 1
2121 0
2122 0
2182 0
2183 1
2184 0
2185 1
2186 0
2187 0
2188 0
2189 0
2190 0 1
0
2191 0
2192 0
2193 0
2295 0
2296 0
2297 0
2298 0 1
2299 0
2300 0
2301 0
2520 0
2521 0
2522 1
2523 0 1
2524 0
2525 0
2526 0
2584 0
2585 0
2586 0
2587 0 1
2588 0
2589 0
2590 1
2660 0
2661 0
2662 0
2663 0 1
2664 0
2665 0
2666 1
2667 1
2668 0
2669 0 1
2670 1
2671 0
2672 0
2821 0
2822 0
2823 0
2824 1 0
2825 1
2826 0
2827 0
2893 0
2894 0
2895 1
2896 0 1
2897 0
2898 0
2899 1
2920 0
2921 1
2922 0
2923 1 0
2924 0
2925 1
2926 1
2961 0
2962 0
2963 0
1
2964 0
2965 0
2966 0
2967 0
2968 0
1
2969 0
2970 1
2971 1
2972 0
2973 0
2974 0
2997 1
2998 1
2999 0
3000 0 1
3001 1
3002 0
3003 0
3004 0
3005 1
3006 0 1
3007 0
3008 0
3009 0
3092 0
3093 1
3094 0
3095 0 1
3096 0
3097 0
3098 0
3135 0
3136 0
3137 1
3138 1 0
3139 1
3140 0
3141 0
3179 0
3180 0
3181 1
3182 0 1
3183 1
3184 1
3185 1
3197 0
3198 0
3199 0
3200 0 1
3201 0
3202 1
3203 0
3259 0
3260 0
3261 0
3262 0 1
3263 1
3264 1
3265 0
3356 0
3357 0
3358 0
3359 0 1
3360 0
3361 0
3362 0
3400 0
3401 0
3402 0
1
3403 0
3404 0
1
3405 0
3406 1
3407 0
3408 0
3409 0
3410 1
3411 0
3412 1
3413 0
3494 0
3495 1
3496 0
3497 0 1
3498 0
3499 0
3500 0
3632 0
3633 0
3634 0
3635 1 0
3636 1
3637 0
3638 0
3710 0
3711 0
3712 1
3713 0 1
3714 0
3715 0
3716 0
3754 0
3755 1
3756 0
3757 1 0
3758 0
3759 1
3760 0
3822 0
3823 1
3824 0
3825 0 1
3826 0
3827 1
3828 1
3879 1
3880 0
3881 0
3882 0 1
3883 0
3884 0
3885 1
3910 0
3911 1
3912 0
3913 1 0
3914 0
3915 0
3916 0
3966 0
3967 1
3968 0
3969 0 1
3970 0
3971 0
3972 1
4011 1
4012 0
4013 0
4014 0 1
4015 0
4016 0
4017 1
4208 0
4209 1
4210 1
4211 0 1
4212 1
4213 1
4214 0
4238 0
4239 0
4240 0
4241 1 0
4242 0
4243 1
4244 1
4277 1
4278 0
4279 0
4280 1 0
4281 1
4282 0
4283 0
4459 1
4460 0
4461 0
4462 1 0
4463 1
4464 0
4465 0
4466 0
4467 1
4468 0
4469 0 1
4470 0
4471 0
4472 0
4495 0
4496 0
4497 0
4498 1
4499 0
4500 0
4501 0
4502 0 1
0
4503 0
4504 0
4505 0
4527 0
4528 1
4529 1
4530 0 1
4531 0
4532 1
4533 0
4537 1
4538 0
4539 0
4540 0 1
4541 0
4542 0
4543 1
4743 0
4744 0
4745 0
4746 1 0
4747 0
4748 0
4749 0
4895 0
4896 0
4897 0
0
4898 1
4899 1
4900 0
4901 1
4902 0
4903 0
4909 0
4910 0
4911 1
4912 0 1
4913 0
4914 0
4915 0
5057 1
5058 0
5059 0
5060 0 1
5061 0
5062 1
5063 0
5135 0
5136 0
5137 0
5138 1 0
5139 1
5140 0
5141 0
5162 0
5163 0
5164 0
5165 0 1
5166 0
5167 0
5168 0
5235 1
5236 1
5237 0
5238 0 1
5239 1
5240 1
5241 1

5
dev-stats.txt Normal file
View File

@ -0,0 +1,5 @@
Likelihood 0.0000
Accuracy 0.7627
F1.0 0.6495
Precision 0.6806
Recall 0.6211

View File

Can't render this file because it is too large.

BIN
geval

Binary file not shown.

View File

@ -1 +0,0 @@
PostText Timestamp
1 PostText Timestamp

View File

@ -23,23 +23,19 @@ def doc2vec(doc):
return np.mean([word2vec[word] for word in doc if word in word2vec] or [np.zeros(300)], axis=0) return np.mean([word2vec[word] for word in doc if word in word2vec] or [np.zeros(300)], axis=0)
x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t', header=None, error_bad_lines=False, quoting=3) x_train = pd.read_table('in-train.tsv.xz', compression='xz', sep='\t', header=None, error_bad_lines=False, quoting=3)
y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3) y_train = pd.read_table('expected-train.tsv', sep='\t', header=None, quoting=3)
x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3) x_dev = pd.read_table('in-dev.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
y_train = y_train[0] y_train = y_train[0]
x_train = x_train[0].str.lower() x_train = x_train[0].str.lower()
x_train = [word_tokenize(x) for x in x_train] x_train = [word_tokenize(x) for x in x_train]
x_dev = x_dev[0].str.lower() x_dev = x_dev[0].str.lower()
x_dev = [word_tokenize(x) for x in x_dev] x_dev = [word_tokenize(x) for x in x_dev]
x_test = x_test[0].str.lower()
x_test = [word_tokenize(x) for x in x_test]
word2vec = gensim.load('word2vec-google-news-300') word2vec = gensim.load('word2vec-google-news-300')
x_train = [doc2vec(doc) for doc in x_train] x_train = [doc2vec(doc) for doc in x_train]
x_dev = [doc2vec(doc) for doc in x_dev] x_dev = [doc2vec(doc) for doc in x_dev]
x_test = [doc2vec(doc) for doc in x_test]
model = NeuralNetworkModel() model = NeuralNetworkModel()
BATCH_SIZE = 1024 BATCH_SIZE = 1024
@ -71,16 +67,6 @@ with torch.no_grad():
y = (outputs > 0.5) y = (outputs > 0.5)
y_dev.extend(y) y_dev.extend(y)
for i in range(0, len(x_test), BATCH_SIZE):
X = x_test[i:i + BATCH_SIZE]
X = torch.tensor(X)
outputs = model(X.float())
y = (outputs >= 0.5)
y_test.extend(y)
y_dev = np.asarray(y_dev, dtype=np.int32) y_dev = np.asarray(y_dev, dtype=np.int32)
Y_dev = pd.DataFrame({'label': y_dev}) Y_dev = pd.DataFrame({'label': y_dev})
Y_dev.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False) Y_dev.to_csv(r'dev-out.tsv', sep='\t', index=False, header=False)
y_test = np.asarray(y_test, dtype=np.int32)
Y_test = pd.DataFrame({'label': y_test})
Y_test.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)

View File

@ -1 +0,0 @@
Label
1 Label

Binary file not shown.

File diff suppressed because it is too large Load Diff