#!/usr/bin/env python
# coding: utf-8
"""Train a TF-IDF + logistic-regression text classifier and write predictions.

The script reads one text per line from ``<split>/in.tsv``, fits the model on
the ``train`` split, reports accuracy on ``dev-0`` and ``dev-1`` and writes the
predicted labels (one per line) for ``dev-0``, ``dev-1`` and ``test-A``.

NOTE(review): the original notebook read the labels of every split from a
single ``expected.tsv`` and wrote every prediction set to a single ``out.tsv``
(each run overwriting the previous one).  This version assumes the labels and
outputs live next to the corresponding ``in.tsv`` (``<split>/expected.tsv``,
``<split>/out.tsv``) -- confirm against the actual directory layout.
"""

import lzma
import shutil
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score


def decompress_xz(archive, target):
    """Decompress the ``.xz`` file *archive* into *target*.

    Does nothing when *target* already exists, so the script can be re-run.
    Unlike the ``unxz`` command-line tool, the archive itself is left in place.

    Raises:
        FileNotFoundError: if *target* is missing and *archive* does not exist.
    """
    if target.exists():
        return
    with lzma.open(archive, "rb") as src, open(target, "wb") as dst:
        shutil.copyfileobj(src, dst)


def read_texts(path):
    """Return the lines of the UTF-8 text file *path* (newlines kept)."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


def read_labels(path):
    """Return the single label column of the tab-separated file *path*.

    The column is returned as a 1-D array, which is the shape scikit-learn
    expects for targets (a one-column DataFrame triggers a conversion warning).
    """
    frame = pd.read_csv(path, sep="\t", names=["male"], encoding="utf-8")
    return frame["male"].values


def write_predictions(path, predictions):
    """Write *predictions* to *path*, one label per line."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{label}\n" for label in predictions)


def predict_split(split_dir, vectorizer, model):
    """Predict labels for ``split_dir/in.tsv`` and save them to ``split_dir/out.tsv``.

    Args:
        split_dir: ``pathlib.Path`` of the directory holding ``in.tsv``.
        vectorizer: an already fitted vectorizer exposing ``transform``.
        model: an already fitted classifier exposing ``predict``.

    Returns:
        The array of predicted labels.
    """
    texts = read_texts(split_dir / "in.tsv")
    predictions = model.predict(vectorizer.transform(texts))
    write_predictions(split_dir / "out.tsv", predictions)
    return predictions


def main():
    """Fit the model on the training split, then evaluate and predict."""
    train_dir = Path("train")
    decompress_xz(train_dir / "in.tsv.xz", train_dir / "in.tsv")

    x_train = read_texts(train_dir / "in.tsv")
    y_train = read_labels(train_dir / "expected.tsv")

    # Fit and transform in one pass instead of fitting and then transforming
    # the same training data a second time.
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    x_train_prepared = tfidf_vectorizer.fit_transform(x_train)

    model = LogisticRegression()
    model.fit(x_train_prepared, y_train)

    # Development splits come with gold labels, so accuracy can be reported.
    for split in ("dev-0", "dev-1"):
        split_dir = Path(split)
        predicted = predict_split(split_dir, tfidf_vectorizer, model)
        expected = read_labels(split_dir / "expected.tsv")
        print(f"{split} accuracy: {accuracy_score(expected, predicted)}")

    # The test split has no labels in this script; only predictions are written.
    predict_split(Path("test-A"), tfidf_vectorizer, model)


if __name__ == "__main__":
    main()