from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lzma


def read_file(f_name, f_ext=""):
    """Read a TSV file line by line; transparently handle xz-compressed input."""
    if f_ext == "xz":
        with lzma.open(f"{f_name}.{f_ext}") as file:
            return [line.strip().decode("utf-8") for line in file.readlines()]
    with open(f_name, encoding="utf-8") as file:
        return [line.strip() for line in file.readlines()]


def write_file(dir_name, preds):
    """Write predictions to <dir_name>/out.tsv, one label per line."""
    with open(f"{dir_name}/out.tsv", "w", encoding="utf-8") as file_out:
        for val in preds:
            file_out.write(f"{val}\n")


# Training data: xz-compressed input texts and the expected labels.
x_data = read_file("train/in.tsv", "xz")
print(x_data[:5])
y_data = read_file("train/expected.tsv")
print(y_data[:5])

# TF-IDF features with Polish stop words removed and very frequent terms dropped.
tfidf_vec = TfidfVectorizer(
    use_idf=True, max_df=0.95, stop_words=get_stop_words("pl"))
x_vectorized = tfidf_vec.fit_transform(x_data)

# Logistic regression classifier trained on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(x_vectorized, y_data)

# Predictions for the dev and test sets.
for data_dir in ["dev-0", "dev-1", "test-A"]:
    print(f"Running predictions for __{data_dir}__")
    x_data = read_file(f"{data_dir}/in.tsv")
    x_vectorized = tfidf_vec.transform(x_data)
    y_predicted = model.predict(x_vectorized)
    write_file(data_dir, y_predicted)