petite-difference-challenge2/run.py

41 lines
1.3 KiB
Python

from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import lzma
def read_file(f_name, f_ext=""):
    """Return the lines of a text file with surrounding whitespace removed.

    When ``f_ext`` is ``"xz"`` the file ``<f_name>.xz`` is opened through
    ``lzma`` in binary mode; every raw line is stripped first and then
    decoded as UTF-8. For any other ``f_ext`` value, ``f_name`` itself is
    opened as UTF-8 text and ``f_ext`` is ignored.
    """
    if f_ext != "xz":
        with open(f_name, encoding="utf-8") as plain:
            return [row.strip() for row in plain]

    archive_path = f"{f_name}.{f_ext}"
    with lzma.open(archive_path) as packed:
        # Strip the raw bytes before decoding, exactly one entry per line.
        return [raw.strip().decode("utf-8") for raw in packed]
def write_file(dir_name, preds):
    """Write predictions to ``<dir_name>/out.tsv``, one value per line.

    Args:
        dir_name: Directory that receives ``out.tsv``; it must already exist.
        preds: Iterable of predicted values; each one is formatted with
            ``str()`` and terminated by a newline.
    """
    # "w" instead of "a": a rerun must replace the previous predictions,
    # otherwise the file keeps growing with stale results.
    with open(f"{dir_name}/out.tsv", "w", encoding="utf-8") as file_out:
        # The newline keeps each prediction on its own row; without it all
        # values are fused into a single unseparated line.
        file_out.writelines(f"{val}\n" for val in preds)
# * Training data
# Inputs are read from the xz-compressed "train/in.tsv.xz"; the expected
# labels are read from the plain-text "train/expected.tsv".
train_texts = read_file("train/in.tsv", "xz")
print(train_texts[:5])
train_labels = read_file("train/expected.tsv")
print(train_labels[:5])

# * Model
# TF-IDF features (terms present in more than 95% of documents and Polish
# stop words are dropped) fed into a logistic regression classifier.
polish_stop_words = get_stop_words("pl")
tfidf_vec = TfidfVectorizer(
    use_idf=True,
    max_df=0.95,
    stop_words=polish_stop_words,
)
train_matrix = tfidf_vec.fit_transform(train_texts)
model = LogisticRegression(max_iter=1000)
model.fit(train_matrix, train_labels)

# * Predictions
# Each split is vectorized with the vocabulary fitted on the training data
# and its predictions are written into that split's own directory.
for data_dir in ("dev-0", "dev-1", "test-A"):
    print(f"running for the __{data_dir}__")
    split_texts = read_file(f"{data_dir}/in.tsv")
    split_matrix = tfidf_vec.transform(split_texts)
    write_file(data_dir, model.predict(split_matrix))