Compare commits
No commits in common. "master" and "my-brilliant-branch" have entirely different histories.
master
...
my-brillia
4
CHANGELOG.md
Normal file
4
CHANGELOG.md
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
<a name="2.0.0"></a>
|
||||||
|
## 2.0.0 (2020-05-22)
|
||||||
|
|
||||||
|
* Switch to probabilities as the main metric
|
6
dev-0/.ipynb_checkpoints/model-checkpoint.ipynb
Normal file
6
dev-0/.ipynb_checkpoints/model-checkpoint.ipynb
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"cells": [],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
3
dev-0/.jovianrc
Normal file
3
dev-0/.jovianrc
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"notebooks": {}
|
||||||
|
}
|
274627
dev-0/out.tsv
274627
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
175
dev-0/run.ipynb
Normal file
175
dev-0/run.ipynb
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 90,
|
||||||
|
"id": "7dc5e391",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import csv"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 91,
|
||||||
|
"id": "a0825c64",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tsv_data = pd.read_csv('in.tsv', sep='\\t',header=None, quoting=csv.QUOTE_NONE)[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 139,
|
||||||
|
"id": "4b9092a6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"expected = pd.read_csv('expected.tsv', sep='\\t',header=None)[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 94,
|
||||||
|
"id": "56c39aa1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"137314\n",
|
||||||
|
"137314\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(len(expected))\n",
|
||||||
|
"print(len(tsv_data))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 158,
|
||||||
|
"id": "d7b300ca",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}\n",
|
||||||
|
"female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}\n",
|
||||||
|
"male = {x[:6].lower() for x in male}\n",
|
||||||
|
"female = {x[:6].lower() for x in female}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 159,
|
||||||
|
"id": "31b5864b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"trimmed_docs=[]\n",
|
||||||
|
"for document in tsv_data:\n",
|
||||||
|
" new_doc=[]\n",
|
||||||
|
" for word in str(document).lower().split():\n",
|
||||||
|
" new_doc.append(word[:6])\n",
|
||||||
|
" trimmed_docs.append(new_doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 160,
|
||||||
|
"id": "c1f02d77",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"male_or_female=[]\n",
|
||||||
|
"for doc in trimmed_docs:\n",
|
||||||
|
" male_or_female.append((len(male&set(doc)), len(female&set(doc))))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 161,
|
||||||
|
"id": "6edfd944",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"answers=[]\n",
|
||||||
|
"for i in male_or_female:\n",
|
||||||
|
" if i[0]>i[1]:\n",
|
||||||
|
" answers.append(1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" answers.append(0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 162,
|
||||||
|
"id": "40369c2b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"result=[]\n",
|
||||||
|
"for i in range(len(answers)):\n",
|
||||||
|
" if answers[i]==expected[i]:\n",
|
||||||
|
" result.append(1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" result.append(0)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 163,
|
||||||
|
"id": "e296921c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Predykcja modelu wynosi 51.007909%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 167,
|
||||||
|
"id": "fee431a4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df = pd.DataFrame(result)\n",
|
||||||
|
"df.to_csv('out.tsv', sep = '\\t')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
46
dev-0/run.py
46
dev-0/run.py
@ -17,17 +17,21 @@ tsv_data = pd.read_csv('in.tsv', sep='\t',header=None, quoting=csv.QUOTE_NONE)[0
|
|||||||
# In[139]:
|
# In[139]:
|
||||||
|
|
||||||
|
|
||||||
#expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
||||||
|
|
||||||
|
|
||||||
|
# In[94]:
|
||||||
|
|
||||||
|
|
||||||
|
print(len(expected))
|
||||||
|
print(len(tsv_data))
|
||||||
|
|
||||||
|
|
||||||
# In[158]:
|
# In[158]:
|
||||||
|
|
||||||
|
|
||||||
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer', 'piłka', 'metal'}
|
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}
|
||||||
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta', 'narzeczony', 'ślub'}
|
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}
|
||||||
male = {x[:6].lower() for x in male}
|
male = {x[:6].lower() for x in male}
|
||||||
female = {x[:6].lower() for x in female}
|
female = {x[:6].lower() for x in female}
|
||||||
|
|
||||||
@ -47,41 +51,41 @@ for document in tsv_data:
|
|||||||
|
|
||||||
|
|
||||||
male_or_female=[]
|
male_or_female=[]
|
||||||
|
|
||||||
for doc in trimmed_docs:
|
for doc in trimmed_docs:
|
||||||
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
||||||
|
|
||||||
doc_mean = sum(map(len, trimmed_docs))/float(len(trimmed_docs))
|
|
||||||
# In[161]:
|
# In[161]:
|
||||||
|
|
||||||
#print(doc_mean)
|
|
||||||
answers=[]
|
|
||||||
for i in range(len(male_or_female)):
|
|
||||||
if male_or_female[i][0]>male_or_female[i][1]:
|
|
||||||
answers.append(1)
|
|
||||||
elif male_or_female[i][0]<male_or_female[i][1]:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
if len(trimmed_docs[i]) < doc_mean:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
answers.append(1)
|
|
||||||
|
|
||||||
|
answers=[]
|
||||||
|
for i in male_or_female:
|
||||||
|
if i[0]>i[1]:
|
||||||
|
answers.append(1)
|
||||||
|
else:
|
||||||
|
answers.append(0)
|
||||||
|
|
||||||
|
|
||||||
# In[162]:
|
# In[162]:
|
||||||
|
|
||||||
"""
|
|
||||||
result=[]
|
result=[]
|
||||||
for i in range(len(answers)):
|
for i in range(len(answers)):
|
||||||
if answers[i]==expected[i]:
|
if answers[i]==expected[i]:
|
||||||
result.append(1)
|
result.append(1)
|
||||||
else:
|
else:
|
||||||
result.append(0)
|
result.append(0)
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[163]:
|
||||||
|
|
||||||
df = pd.Series(answers)
|
|
||||||
df.to_csv('out.tsv', sep = '\t', index=False, header=False)
|
print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')
|
||||||
|
|
||||||
|
|
||||||
|
# In[167]:
|
||||||
|
|
||||||
|
|
||||||
|
df = pd.DataFrame(result)
|
||||||
|
df.to_csv('out.tsv', sep = '\t')
|
||||||
|
|
||||||
|
@ -1 +0,0 @@
|
|||||||
,DESKTOP-QF2J2E3/riraa,DESKTOP-QF2J2E3,27.04.2022 01:17,file:///C:/Users/riraa/AppData/Roaming/LibreOffice/4;
|
|
@ -1 +0,0 @@
|
|||||||
,DESKTOP-QF2J2E3/riraa,DESKTOP-QF2J2E3,27.04.2022 01:16,file:///C:/Users/riraa/AppData/Roaming/LibreOffice/4;
|
|
156606
dev-1/out.tsv
156606
dev-1/out.tsv
File diff suppressed because it is too large
Load Diff
87
dev-1/run.py
87
dev-1/run.py
@ -1,87 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
# In[90]:
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import csv
|
|
||||||
|
|
||||||
|
|
||||||
# In[91]:
|
|
||||||
|
|
||||||
|
|
||||||
tsv_data = pd.read_csv('in.tsv', sep='\t',header=None, quoting=csv.QUOTE_NONE)[0]
|
|
||||||
|
|
||||||
|
|
||||||
# In[139]:
|
|
||||||
|
|
||||||
|
|
||||||
#expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[158]:
|
|
||||||
|
|
||||||
|
|
||||||
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer', 'piłka', 'metal'}
|
|
||||||
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta', 'narzeczony', 'ślub'}
|
|
||||||
male = {x[:6].lower() for x in male}
|
|
||||||
female = {x[:6].lower() for x in female}
|
|
||||||
|
|
||||||
|
|
||||||
# In[159]:
|
|
||||||
|
|
||||||
|
|
||||||
trimmed_docs=[]
|
|
||||||
for document in tsv_data:
|
|
||||||
new_doc=[]
|
|
||||||
for word in str(document).lower().split():
|
|
||||||
new_doc.append(word[:6])
|
|
||||||
trimmed_docs.append(new_doc)
|
|
||||||
|
|
||||||
|
|
||||||
# In[160]:
|
|
||||||
|
|
||||||
|
|
||||||
male_or_female=[]
|
|
||||||
|
|
||||||
for doc in trimmed_docs:
|
|
||||||
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
|
||||||
|
|
||||||
doc_mean = sum(map(len, trimmed_docs))/float(len(trimmed_docs))
|
|
||||||
# In[161]:
|
|
||||||
|
|
||||||
#print(doc_mean)
|
|
||||||
answers=[]
|
|
||||||
for i in range(len(male_or_female)):
|
|
||||||
if male_or_female[i][0]>male_or_female[i][1]:
|
|
||||||
answers.append(1)
|
|
||||||
elif male_or_female[i][0]<male_or_female[i][1]:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
if len(trimmed_docs[i]) < doc_mean:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
answers.append(1)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[162]:
|
|
||||||
|
|
||||||
"""
|
|
||||||
result=[]
|
|
||||||
for i in range(len(answers)):
|
|
||||||
if answers[i]==expected[i]:
|
|
||||||
result.append(1)
|
|
||||||
else:
|
|
||||||
result.append(0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
df = pd.Series(answers)
|
|
||||||
df.to_csv('out.tsv', sep = '\t', index=False, header=False)
|
|
||||||
|
|
87
run.py
87
run.py
@ -1,87 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
# In[90]:
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import csv
|
|
||||||
|
|
||||||
|
|
||||||
# In[91]:
|
|
||||||
|
|
||||||
|
|
||||||
tsv_data = pd.read_csv('in.tsv', sep='\t',header=None, quoting=csv.QUOTE_NONE)[0]
|
|
||||||
|
|
||||||
|
|
||||||
# In[139]:
|
|
||||||
|
|
||||||
|
|
||||||
#expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[158]:
|
|
||||||
|
|
||||||
|
|
||||||
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer', 'piłka', 'metal'}
|
|
||||||
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta', 'narzeczony', 'ślub'}
|
|
||||||
male = {x[:6].lower() for x in male}
|
|
||||||
female = {x[:6].lower() for x in female}
|
|
||||||
|
|
||||||
|
|
||||||
# In[159]:
|
|
||||||
|
|
||||||
|
|
||||||
trimmed_docs=[]
|
|
||||||
for document in tsv_data:
|
|
||||||
new_doc=[]
|
|
||||||
for word in str(document).lower().split():
|
|
||||||
new_doc.append(word[:6])
|
|
||||||
trimmed_docs.append(new_doc)
|
|
||||||
|
|
||||||
|
|
||||||
# In[160]:
|
|
||||||
|
|
||||||
|
|
||||||
male_or_female=[]
|
|
||||||
|
|
||||||
for doc in trimmed_docs:
|
|
||||||
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
|
||||||
|
|
||||||
doc_mean = sum(map(len, trimmed_docs))/float(len(trimmed_docs))
|
|
||||||
# In[161]:
|
|
||||||
|
|
||||||
#print(doc_mean)
|
|
||||||
answers=[]
|
|
||||||
for i in range(len(male_or_female)):
|
|
||||||
if male_or_female[i][0]>male_or_female[i][1]:
|
|
||||||
answers.append(1)
|
|
||||||
elif male_or_female[i][0]<male_or_female[i][1]:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
if len(trimmed_docs[i]) < doc_mean:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
answers.append(1)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[162]:
|
|
||||||
|
|
||||||
"""
|
|
||||||
result=[]
|
|
||||||
for i in range(len(answers)):
|
|
||||||
if answers[i]==expected[i]:
|
|
||||||
result.append(1)
|
|
||||||
else:
|
|
||||||
result.append(0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
df = pd.Series(answers)
|
|
||||||
df.to_csv('out.tsv', sep = '\t', index=False, header=False)
|
|
||||||
|
|
134618
test-A/out.tsv
134618
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
@ -1,87 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
# In[90]:
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import csv
|
|
||||||
|
|
||||||
|
|
||||||
# In[91]:
|
|
||||||
|
|
||||||
|
|
||||||
tsv_data = pd.read_csv('in.tsv', sep='\t',header=None, quoting=csv.QUOTE_NONE)[0]
|
|
||||||
|
|
||||||
|
|
||||||
# In[139]:
|
|
||||||
|
|
||||||
|
|
||||||
#expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[158]:
|
|
||||||
|
|
||||||
|
|
||||||
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer', 'piłka', 'metal'}
|
|
||||||
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta', 'narzeczony', 'ślub'}
|
|
||||||
male = {x[:6].lower() for x in male}
|
|
||||||
female = {x[:6].lower() for x in female}
|
|
||||||
|
|
||||||
|
|
||||||
# In[159]:
|
|
||||||
|
|
||||||
|
|
||||||
trimmed_docs=[]
|
|
||||||
for document in tsv_data:
|
|
||||||
new_doc=[]
|
|
||||||
for word in str(document).lower().split():
|
|
||||||
new_doc.append(word[:6])
|
|
||||||
trimmed_docs.append(new_doc)
|
|
||||||
|
|
||||||
|
|
||||||
# In[160]:
|
|
||||||
|
|
||||||
|
|
||||||
male_or_female=[]
|
|
||||||
|
|
||||||
for doc in trimmed_docs:
|
|
||||||
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
|
||||||
|
|
||||||
doc_mean = sum(map(len, trimmed_docs))/float(len(trimmed_docs))
|
|
||||||
# In[161]:
|
|
||||||
|
|
||||||
#print(doc_mean)
|
|
||||||
answers=[]
|
|
||||||
for i in range(len(male_or_female)):
|
|
||||||
if male_or_female[i][0]>male_or_female[i][1]:
|
|
||||||
answers.append(1)
|
|
||||||
elif male_or_female[i][0]<male_or_female[i][1]:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
if len(trimmed_docs[i]) < doc_mean:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
answers.append(1)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[162]:
|
|
||||||
|
|
||||||
"""
|
|
||||||
result=[]
|
|
||||||
for i in range(len(answers)):
|
|
||||||
if answers[i]==expected[i]:
|
|
||||||
result.append(1)
|
|
||||||
else:
|
|
||||||
result.append(0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
df = pd.Series(answers)
|
|
||||||
df.to_csv('out.tsv', sep = '\t', index=False, header=False)
|
|
||||||
|
|
87
train/run.py
87
train/run.py
@ -1,87 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
# In[90]:
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import csv
|
|
||||||
|
|
||||||
|
|
||||||
# In[91]:
|
|
||||||
|
|
||||||
|
|
||||||
tsv_data = pd.read_csv('in.tsv', sep='\t',header=None, quoting=csv.QUOTE_NONE)[0]
|
|
||||||
|
|
||||||
|
|
||||||
# In[139]:
|
|
||||||
|
|
||||||
|
|
||||||
#expected = pd.read_csv('expected.tsv', sep='\t',header=None)[0]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[158]:
|
|
||||||
|
|
||||||
|
|
||||||
male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer', 'piłka', 'metal'}
|
|
||||||
female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta', 'narzeczony', 'ślub'}
|
|
||||||
male = {x[:6].lower() for x in male}
|
|
||||||
female = {x[:6].lower() for x in female}
|
|
||||||
|
|
||||||
|
|
||||||
# In[159]:
|
|
||||||
|
|
||||||
|
|
||||||
trimmed_docs=[]
|
|
||||||
for document in tsv_data:
|
|
||||||
new_doc=[]
|
|
||||||
for word in str(document).lower().split():
|
|
||||||
new_doc.append(word[:6])
|
|
||||||
trimmed_docs.append(new_doc)
|
|
||||||
|
|
||||||
|
|
||||||
# In[160]:
|
|
||||||
|
|
||||||
|
|
||||||
male_or_female=[]
|
|
||||||
|
|
||||||
for doc in trimmed_docs:
|
|
||||||
male_or_female.append((len(male&set(doc)), len(female&set(doc))))
|
|
||||||
|
|
||||||
doc_mean = sum(map(len, trimmed_docs))/float(len(trimmed_docs))
|
|
||||||
# In[161]:
|
|
||||||
|
|
||||||
#print(doc_mean)
|
|
||||||
answers=[]
|
|
||||||
for i in range(len(male_or_female)):
|
|
||||||
if male_or_female[i][0]>male_or_female[i][1]:
|
|
||||||
answers.append(1)
|
|
||||||
elif male_or_female[i][0]<male_or_female[i][1]:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
if len(trimmed_docs[i]) < doc_mean:
|
|
||||||
answers.append(0)
|
|
||||||
else:
|
|
||||||
answers.append(1)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[162]:
|
|
||||||
|
|
||||||
"""
|
|
||||||
result=[]
|
|
||||||
for i in range(len(answers)):
|
|
||||||
if answers[i]==expected[i]:
|
|
||||||
result.append(1)
|
|
||||||
else:
|
|
||||||
result.append(0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
df = pd.Series(answers)
|
|
||||||
df.to_csv('out.tsv', sep = '\t', index=False, header=False)
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user