"""Train a TF-IDF + linear-regression model and write predictions.

Run as a script, this reads training documents from ``train/train.tsv.xz``
and integer targets from one column of ``train/meta.tsv.xz``, fits a
``TfidfVectorizer`` followed by a ``LinearRegression``, and writes one
prediction per line to ``out.tsv`` in each of ``dev-0``, ``dev-1`` and
``test-A``.
"""

import lzma

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Zero-based index of the tab-separated column in meta.tsv.xz that holds the
# integer regression target.
LABEL_COLUMN = 5


def generate_out(folder_path, vectorizer=None, model=None):
    """Predict for ``<folder_path>/in.tsv`` and write ``<folder_path>/out.tsv``.

    Each line of ``in.tsv`` is treated as one document. The output file is
    overwritten on every call and contains one prediction per line, in the
    same order as the input lines.

    Args:
        folder_path: Directory containing ``in.tsv``; ``out.tsv`` is written
            into the same directory.
        vectorizer: Fitted object exposing ``transform(list_of_str)``. When
            omitted, the module-level name ``vectorizer`` is used.
        model: Fitted object exposing ``predict(features)``. When omitted,
            the module-level name ``model`` is used.
    """
    # Backward compatibility: earlier versions of this function read the
    # fitted objects from module globals instead of taking them as arguments.
    if vectorizer is None:
        vectorizer = globals()["vectorizer"]
    if model is None:
        model = globals()["model"]

    print('Generating out')
    # The training data is decoded as UTF-8, so read the evaluation data the
    # same way instead of relying on the platform's default encoding.
    with open(f'{folder_path}/in.tsv', 'r', encoding='utf-8') as in_file:
        documents = [line.strip() for line in in_file]
    print("step 5")

    features = vectorizer.transform(documents)
    prediction = model.predict(features)
    print("step 6")

    # "w" rather than "a": re-running the script must not append a second
    # copy of the predictions, otherwise out.tsv stops lining up with in.tsv.
    with open(f"{folder_path}/out.tsv", "w", encoding="utf-8") as out_file:
        out_file.writelines(str(p) + '\n' for p in prediction)


def _read_documents(path):
    """Return the stripped lines of the xz-compressed UTF-8 text file *path*."""
    # newline="\n" keeps line splitting identical to iterating the file in
    # binary mode: a lone "\r" inside a document does not start a new line.
    with lzma.open(path, 'rt', encoding='utf-8', newline='\n') as file:
        return [line.strip() for line in file]


def _read_targets(path, column=LABEL_COLUMN):
    """Return ``int`` targets taken from *column* of the xz-compressed TSV *path*."""
    with lzma.open(path, 'rt', encoding='utf-8', newline='\n') as file:
        # Strip only the line terminator: a full strip() would also remove
        # leading/trailing tabs and shift the column index on rows whose
        # first or last fields are empty.
        return [int(line.rstrip('\r\n').split('\t')[column]) for line in file]


def main():
    """Fit the model on the training set and write predictions for each split."""
    documents = _read_documents('train/train.tsv.xz')
    print("step 1")

    targets = _read_targets('train/meta.tsv.xz')
    print("step 2")

    # Fail early with a clear message rather than after vectorizing everything.
    if len(documents) != len(targets):
        raise ValueError(
            f"train.tsv.xz has {len(documents)} rows but meta.tsv.xz has "
            f"{len(targets)} rows"
        )

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(documents)
    print("step 3")

    model = LinearRegression()
    model.fit(features, targets)
    print("step 4")

    for folder in ('dev-0', 'dev-1', 'test-A'):
        generate_out(folder, vectorizer, model)


if __name__ == "__main__":
    main()