This commit is contained in:
korne 2022-06-15 00:07:45 +02:00
parent db6d196edb
commit f140a121a2
4 changed files with 3280 additions and 194 deletions

First changed file (name not shown in this view): an output file of 0/1 labels. The change flips individual 0/1 labels in 58 hunks spanning file lines 9–5242, for example line 12: 1 → 0, line 44: 0 → 1, line 5239: 1 → 0. (Per-line diff listing omitted.)

run.ipynb (181 changed lines)

@@ -16,47 +16,25 @@
"from nltk import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbe60d7b-850e-4838-b4ce-672f13bf2bb2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
"#def read_file(filename):\n",
"# result = []\n",
"# with open(filename, 'r', encoding=\"utf-8\") as file:\n",
"# for line in file:\n",
"# text = line.split(\"\\t\")[0].strip()\n",
"# result.append(text)\n",
"# return result"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename):\n",
" result = []\n",
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
" for line in file:\n",
" text = line.split(\"\\t\")[0].strip()\n",
" result.append(text)\n",
" return result"
]
},
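The diff drops the standalone read_file helper (the old cell is removed and a commented-out copy is kept in the merged cell above); the data itself is loaded with pandas further down. A minimal pandas-based equivalent of the old helper, as a sketch (the function name is hypothetical; it assumes a headerless TSV with the text in the first column, matching the pd.read_table call used for train/in.tsv):

import pandas as pd

def read_first_column(path):
    # quoting=3 (csv.QUOTE_NONE) mirrors the read_table call used elsewhere in this diff
    frame = pd.read_table(path, sep='\t', header=None, quoting=3)
    return frame[0].str.strip().tolist()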
{
"cell_type": "code",
"execution_count": 4,
"id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
"metadata": {},
"outputs": [
@@ -163,7 +141,7 @@
"[200000 rows x 2 columns]"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -176,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
"metadata": {},
"outputs": [
@@ -271,7 +249,7 @@
"[200000 rows x 1 columns]"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -285,7 +263,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "6b27e6ce-e9fd-41a1-aacf-53a5fde0a7c1",
"metadata": {},
"outputs": [
@@ -392,7 +370,7 @@
"[5272 rows x 2 columns]"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -405,7 +383,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "99ae526d-9b7c-493f-be4f-f95b1c8f4b81",
"metadata": {},
"outputs": [
@@ -512,7 +490,7 @@
"[5152 rows x 2 columns]"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -525,7 +503,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "dba17668-971f-47f8-99ce-fc840b5cb74a",
"metadata": {},
"outputs": [],
@@ -546,7 +524,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "1a275c1d-75bc-4290-9332-56396d16a0f2",
"metadata": {},
"outputs": [],
@@ -558,15 +536,48 @@
"\n",
"x_train = [word_tokenize(x) for x in x_train]\n",
"x_dev = [word_tokenize(x) for x in x_dev]\n",
"x_test = [word_tokenize(x) for x in x_test]"
"x_test = [word_tokenize(x) for x in x_test]\n",
"#x_test"
]
},
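The tokenization above uses nltk.word_tokenize, which depends on NLTK's 'punkt' tokenizer models; a one-time download (a sketch, assuming the models are not already installed in the environment) avoids a LookupError when the notebook is re-run on a clean machine:

import nltk
nltk.download('punkt', quiet=True)  # tokenizer models required by word_tokenize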
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"id": "3125d2f2-0da9-45eb-acf1-90293c6d64a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5152"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "031a3670-3be7-4146-97b4-0dacd4f9ae58",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"5152"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from gensim.test.utils import common_texts\n",
"from gensim.models import Word2Vec\n",
@@ -574,24 +585,16 @@
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
"x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
"x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]"
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n",
"len(x_test)"
]
},
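The updated cell represents every document as the mean of its word2vec vectors and falls back to a 300-dimensional zero vector when none of the tokens is in the vocabulary. A standalone sketch of that pooling step (the function name is hypothetical; word2vec is assumed to be the loaded gensim KeyedVectors):

import numpy as np

def embed_document(tokens, word2vec, dim=300):
    vectors = [word2vec[w] for w in tokens if w in word2vec]
    if not vectors:
        return np.zeros(dim)          # no in-vocabulary token: zero-vector fallback
    return np.mean(vectors, axis=0)   # element-wise average over the document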
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 15,
"id": "b7defd18-e281-4cf6-9941-cee560749677",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_22024\\3484013121.py:10: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\torch\\csrc\\utils\\tensor_new.cpp:210.)\n",
" X = torch.tensor(X)\n"
]
}
],
"outputs": [],
"source": [
"model = NeuralNetworkModel()\n",
"BATCH_SIZE = 5\n",
@@ -614,10 +617,21 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 17,
"id": "92c69ddd-fe58-477f-b2c2-06324a983bcc",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"5152"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_dev = []\n",
"y_test = []\n",
@@ -635,60 +649,51 @@
" X = x_test[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" outputs = model(X.float())\n",
" y = (outputs >= 0.5)\n",
" y_test += prediction.tolist()"
" prediction = (outputs >= 0.5)\n",
" y_test += prediction.tolist()\n",
"len(y_test)"
]
},
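The fix above assigns the thresholded outputs to prediction before appending them (the removed line thresholded into y and then appended an undefined prediction). A self-contained sketch of the corrected batched inference, assuming model returns one probability per row and x_test holds 300-dimensional feature vectors; converting each batch with np.array first also avoids the list-of-ndarrays warning visible in the removed stderr output earlier in this diff:

import numpy as np
import torch

y_test = []
with torch.no_grad():
    for i in range(0, len(x_test), BATCH_SIZE):
        batch = torch.tensor(np.array(x_test[i:i + BATCH_SIZE])).float()
        outputs = model(batch)
        prediction = (outputs >= 0.5)                  # threshold probabilities to 0/1
        y_test += prediction.int().flatten().tolist()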
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "caff921c-d0ab-4fce-a17f-6610266b404d",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"2062"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
"y_test = np.asarray(y_test, dtype=np.int32)"
"y_test = np.asarray(y_test, dtype=np.int32)\n",
"len(y_test)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"id": "73076eb2-810f-4f85-aa3f-05ee884c413b",
"metadata": {},
"outputs": [],
"source": [
"with open('./dev-0/out.tsv', 'wt') as file:\n",
" for r in y_dev:\n",
" file.write(str(r) + '\\n') "
"y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
"y_test.tofile('./test-A/out.tsv', sep='\\n')"
]
},
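One behavioural note on the new output code: ndarray.tofile(..., sep='\n') joins the labels with newlines but writes no trailing newline. If the evaluation tooling expects exactly one label per line, np.savetxt makes that format explicit (a sketch, assuming plain integer labels are wanted):

import numpy as np

np.savetxt('./dev-0/out.tsv', y_dev, fmt='%d')    # one label per line, trailing newline included
np.savetxt('./test-A/out.tsv', y_test, fmt='%d')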
{
"cell_type": "code",
"execution_count": 16,
"id": "ddda251c-cafa-40f8-a020-48310a9f23b6",
"metadata": {},
"outputs": [],
"source": [
"with open('./test-A/out.tsv', 'wt') as file:\n",
" for r in y_test:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"id": "5730562a-0200-4c8f-8f73-992fa2b36133",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 3816 bytes to run.py\n"
]
}
],
"outputs": [],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]

run.py (79 changed lines)

@@ -13,43 +13,27 @@ import gensim.downloader
from nltk import word_tokenize
# In[ ]:
# In[2]:
def predict_year(x, path_out, model):
results = model.predict(x)
with open(path_out, 'wt') as file:
for r in results:
file.write(str(r) + '\n')
#def read_file(filename):
# result = []
# with open(filename, 'r', encoding="utf-8") as file:
# for line in file:
# text = line.split("\t")[0].strip()
# result.append(text)
# return result
# In[3]:
def read_file(filename):
result = []
with open(filename, 'r', encoding="utf-8") as file:
for line in file:
text = line.split("\t")[0].strip()
result.append(text)
return result
# In[4]:
x_train = pd.read_table('train/in.tsv', sep='\t', header=None, quoting=3)
x_train = x_train[0:200000]
x_train
# In[5]:
# In[4]:
with open('train/expected.tsv', 'r', encoding='utf8') as file:
@@ -58,7 +42,7 @@ y_train = y_train[0:200000]
y_train
# In[6]:
# In[5]:
with open('dev-0/in.tsv', 'r', encoding='utf8') as file:
@@ -66,7 +50,7 @@ with open('dev-0/in.tsv', 'r', encoding='utf8') as file:
x_dev
# In[7]:
# In[6]:
with open('test-A/in.tsv', 'r', encoding='utf8') as file:
@@ -74,7 +58,7 @@ with open('test-A/in.tsv', 'r', encoding='utf8') as file:
x_test
# In[8]:
# In[7]:
class NeuralNetworkModel(torch.nn.Module):
@@ -91,7 +75,7 @@ class NeuralNetworkModel(torch.nn.Module):
return x
# In[9]:
# In[8]:
x_train = x_train[0].str.lower()
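The NeuralNetworkModel body sits almost entirely outside this hunk (only the trailing return x is visible). Purely as a hypothetical sketch, not the author's actual class, a comparable binary classifier over the 300-dimensional mean word2vec features could look like this:

import torch

class TinyClassifier(torch.nn.Module):  # hypothetical stand-in, not the class from run.py
    def __init__(self, input_dim=300, hidden_dim=600):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # probability of the positive class
        return x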
@@ -102,9 +86,16 @@ x_test = x_test[0].str.lower()
x_train = [word_tokenize(x) for x in x_train]
x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]
#x_test
# In[11]:
# In[9]:
len(x_test)
# In[10]:
from gensim.test.utils import common_texts
@@ -114,9 +105,10 @@ word2vec = gensim.downloader.load('word2vec-google-news-300')
x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]
x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]
x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]
len(x_test)
# In[ ]:
# In[15]:
model = NeuralNetworkModel()
@@ -138,7 +130,7 @@ for epoch in range(BATCH_SIZE):
optimizer.step()
# In[ ]:
# In[17]:
y_dev = []
@@ -157,31 +149,24 @@ with torch.no_grad():
X = x_test[i:i + BATCH_SIZE]
X = torch.tensor(X)
outputs = model(X.float())
y = (outputs >= 0.5)
prediction = (outputs >= 0.5)
y_test += prediction.tolist()
len(y_test)
# In[ ]:
# In[13]:
y_dev = np.asarray(y_dev, dtype=np.int32)
y_test = np.asarray(y_test, dtype=np.int32)
len(y_test)
# In[ ]:
with open('./dev-0/out.tsv', 'wt') as file:
for r in y_dev:
file.write(str(r) + '\n')
# In[ ]:
with open('./test-A/out.tsv', 'wt') as file:
for r in y_test:
file.write(str(r) + '\n')
y_dev.tofile('./dev-0/out.tsv', sep='\n')
y_test.tofile('./test-A/out.tsv', sep='\n')
# In[ ]:
@@ -189,3 +174,9 @@ with open('./test-A/out.tsv', 'wt') as file:
get_ipython().system('jupyter nbconvert --to script run.ipynb')
# In[ ]:

File diff suppressed because it is too large.