444421
This commit is contained in:
parent
b775a221e6
commit
24fbc9c1c2
137314
dev-0/out.tsv
Normal file
137314
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out.tsv
Normal file
156606
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
149
run.py
Normal file
149
run.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Decompress the training input.  The notebook shelled out to `unxz`,
# which is not valid Python when this file is run as a script, so the
# archive is unpacked with the standard library instead.  Unlike `unxz`,
# the original .xz archive is left in place.
import lzma
import shutil

with lzma.open("train/in.tsv.xz", "rb") as packed, \
        open("train/in.tsv", "wb") as unpacked:
    shutil.copyfileobj(packed, unpacked)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Load the raw training documents, one per line.  The context manager
# guarantees the file handle is closed (the original never closed it).
with open("train/in.tsv", "r", encoding='utf-8') as f:
    lines = f.readlines()

# Single-column frame; the column is named 'text' up front instead of
# being renamed from the default 0 afterwards.
x_train = pd.DataFrame(lines, columns=['text'])
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Training labels live next to the training input.  The original read a
# bare 'expected.tsv' from the working directory, the same path used for
# every split, so it could not match all of them.
# NOTE(review): assumes the layout train/expected.tsv - confirm.
y_train = pd.read_csv('train/expected.tsv', sep='\t', names=['male'], encoding='utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Fit the TF-IDF vocabulary on the training text and vectorise it in one
# pass.  The original discarded the fit_transform() result and then ran
# transform() over the same corpus again, doing the work twice.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
x_train_prepared = tfidf_vectorizer.fit_transform(x_train['text'].values)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Train a logistic-regression classifier on the TF-IDF features.  The
# label column is passed as a 1-D Series because scikit-learn expects a
# 1-D target and warns when handed a single-column DataFrame.
model = LogisticRegression()
model.fit(x_train_prepared, y_train['male'])
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Load the dev-0 documents, one per line.  The context manager closes
# the handle, which the original left open.
with open("dev-0/in.tsv", "r", encoding='utf-8') as f:
    lines_dev_0 = f.readlines()

x_test_dev_0 = pd.DataFrame(lines_dev_0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Name the single column 'text', then vectorise the dev-0 documents with
# the already-fitted TF-IDF vectoriser.
x_test_dev_0 = x_test_dev_0.rename(columns={0: 'text'})
dev_0_texts = x_test_dev_0['text'].values
x_test_dev_0_prepared = tfidf_vectorizer.transform(dev_0_texts)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Gold labels for dev-0 are read from the dev-0 directory.  The original
# read a bare 'expected.tsv', the same path used for every split.
# NOTE(review): assumes the layout dev-0/expected.tsv - confirm.
y_dev_0_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['male'], encoding='utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Predict labels for dev-0.
y_dev_0_predicted = model.predict(x_test_dev_0_prepared)

# Write one prediction per line into the split's own directory, so later
# splits do not overwrite this file (the original wrote a shared 'out.tsv').
with open('dev-0/out.tsv', 'w', encoding='utf-8') as out_file:
    out_file.writelines(f'{y}\n' for y in y_dev_0_predicted)

# The original referenced the undefined names y_expected / y_predicted,
# which raised NameError; use the dev-0 variables defined above.
accuracy_dev_0 = accuracy_score(y_dev_0_expected, y_dev_0_predicted)
# A bare expression only displays in a notebook; print it for script runs.
print(f'dev-0 accuracy: {accuracy_dev_0}')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Load the dev-1 documents, one per line.  A context manager replaces
# the manual open/close so the handle is released even if reading fails.
with open("dev-1/in.tsv", "r", encoding='utf-8') as f:
    lines_dev_1 = f.readlines()
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Wrap the dev-1 lines in a one-column frame named 'text' and vectorise
# them with the already-fitted TF-IDF vectoriser.
x_test_dev_1 = pd.DataFrame(lines_dev_1).rename(columns={0: 'text'})
dev_1_texts = x_test_dev_1['text'].values
x_test_dev_1_prepared = tfidf_vectorizer.transform(dev_1_texts)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Gold labels for dev-1 are read from the dev-1 directory.  The original
# read a bare 'expected.tsv', the same path used for every split.
# NOTE(review): assumes the layout dev-1/expected.tsv - confirm.
y_dev_1_expected = pd.read_csv('dev-1/expected.tsv', sep='\t', names=['male'], encoding='utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Predict labels for dev-1.
y_dev_1_predicted = model.predict(x_test_dev_1_prepared)

# Write one prediction per line into the split's own directory, so the
# splits do not overwrite each other (the original wrote a shared 'out.tsv').
with open('dev-1/out.tsv', 'w', encoding='utf-8') as out_file:
    out_file.writelines(f'{y}\n' for y in y_dev_1_predicted)

accuracy_dev_1 = accuracy_score(y_dev_1_expected, y_dev_1_predicted)
# A bare expression only displays in a notebook; print it for script runs.
print(f'dev-1 accuracy: {accuracy_dev_1}')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Load the test-A documents, one per line.  A context manager replaces
# the manual open/close so the handle is released even if reading fails.
with open("test-A/in.tsv", "r", encoding='utf-8') as f:
    lines_testing = f.readlines()
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Wrap the test-A lines in a one-column frame named 'text' and vectorise
# them with the already-fitted TF-IDF vectoriser.
x_testing = pd.DataFrame(lines_testing).rename(columns={0: 'text'})
testing_texts = x_testing['text'].values
x_testing_prepared = tfidf_vectorizer.transform(testing_texts)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Predict labels for the unlabelled test-A split.
y_testing_predicted = model.predict(x_testing_prepared)

# In[ ]:

# Write one prediction per line into the split's own directory, so the
# splits do not overwrite each other (the original wrote a shared 'out.tsv').
with open('test-A/out.tsv', 'w', encoding='utf-8') as out_file:
    out_file.writelines(f'{y}\n' for y in y_testing_predicted)
|
||||||
|
|
134618
test-A/out.tsv
Normal file
134618
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user