"""Train a TF-IDF + SVM text classifier and write predictions for the dev set.

Every line of ``train/in.tsv`` is one training sample and the line at the same
position in ``train/expected.tsv`` is its label.  The fitted pipeline predicts
a label for every line of ``dev-0/in.tsv`` and the labels are written, one per
line, to ``svm/out.tsv``.
"""

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

TRAIN_TEXTS_PATH = "train/in.tsv"
TRAIN_LABELS_PATH = "train/expected.tsv"
DEV_TEXTS_PATH = "dev-0/in.tsv"
OUTPUT_PATH = "svm/out.tsv"


def _read_lines(path):
    """Return the lines of the text file at *path*, line endings included.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(path, encoding="utf-8") as f:
        return f.readlines()


x_train = _read_lines(TRAIN_TEXTS_PATH)
x_dev = _read_lines(DEV_TEXTS_PATH)

# Strip the line terminator from every label: otherwise a final line without a
# trailing newline ("1" vs "1\n") would be treated as a separate class.
raw_labels = [line.strip() for line in _read_lines(TRAIN_LABELS_PATH)]

# Keep the encoder around so predictions can be mapped back to the original
# labels; the encoded indices only equal the labels when the labels happen to
# be 0..k-1 in string-sorted order.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(raw_labels)

pipeline = make_pipeline(TfidfVectorizer(), svm.SVC())
model = pipeline.fit(x_train, y_train)
prediction = model.predict(x_dev)

predicted_labels = label_encoder.inverse_transform(prediction)

# Create the output directory up front so the write cannot fail on a missing
# "svm/" folder after the model has already been trained.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.writelines(f"{label}\n" for label in predicted_labels)