This commit is contained in:
MatOgr 2022-05-10 23:14:08 +02:00
parent c3ce71c113
commit 11867437bf
2 changed files with 488 additions and 284918 deletions

11
run.py
View File

@@ -5,18 +5,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
# * Training data loading # * Training data loading
with open('train/in.tsv', 'r', encoding='utf-8') as f: with open('train/in.tsv', 'r', encoding='utf-8') as f:
x_train = pd.DataFrame([line.strip().split('\t') x_train = pd.DataFrame(f.readlines(), columns=['text'])
for line in f.readlines()], columns=['text', 'text_id'])
y_train = pd.read_csv('train/expected.tsv', sep='\t', y_train = pd.read_csv('train/expected.tsv', sep='\t',
names=['paranormal'], encoding='utf-8') names=['paranormal'], encoding='utf-8')
# *Validation data loading # *Validation data loading
with open('dev-0/in.tsv', 'r', encoding='utf-8') as f: with open('dev-0/in.tsv', 'r', encoding='utf-8') as f:
x_dev = pd.DataFrame([line.strip().split('\t') x_dev = pd.DataFrame(f.readlines(), columns=['text'])
for line in f.readlines()], columns=['text', 'text_id'])
# * Test data loading # * Test data loading
with open('train/in.tsv', 'r', encoding='utf-8') as f: with open('test-A/in.tsv', 'r', encoding='utf-8') as f:
x_test = pd.DataFrame([line.strip().split('\t') x_test = pd.DataFrame(f.readlines(), columns=['text'])
for line in f.readlines()], columns=['text', 'text_id'])
# * Training data preparation # * Training data preparation
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=500) tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=500)

File diff suppressed because it is too large Load Diff