"""Split a labelled TSV training set by class and clean the resulting text files."""
import pickle
|
|
import re
|
|
import pandas as pd
|
|
from nltk.tokenize import word_tokenize
|
|
from spacy import tokenizer
|
|
from spacy.tokenizer import Tokenizer
|
|
from spacy.lang.en import English
|
|
|
|
|
|
def create_train(input, output1, output0):
    """Split a labelled TSV file into two text files, one per label.

    Parameters
    ----------
    input : str
        Path to a tab-separated file (optionally gzip-compressed) whose two
        columns are a 0/1 label followed by the text.
    output1 : str
        Output path receiving the text of rows labelled 1.
    output0 : str
        Output path receiving the text of rows labelled 0.
    """
    # NOTE: the parameter name `input` shadows the builtin; kept unchanged
    # for backward compatibility with existing callers.
    create_dictionary = pd.read_csv(
        input,
        delimiter="\t",
        header=None,
        names=["num", "txt"],
        # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines="skip"`
        # (pandas >= 1.3) is the supported equivalent.
        on_bad_lines="skip",
        skip_blank_lines=False,
    )
    # Cap the training set at the first 900k rows.
    create_dictionary = create_dictionary[:900000]
    # `where` masks non-matching rows with NaN; `dropna` discards them.
    values_1 = (create_dictionary['txt'].where(create_dictionary['num'] == 1)).dropna()
    values_0 = (create_dictionary['txt'].where(create_dictionary['num'] == 0)).dropna()
    values_1.to_csv(output1, header=None, index=None)
    values_0.to_csv(output0, header=None, index=None)
|
|
|
|
|
|
def tokenize(input, output):
    """Strip URLs and punctuation from each line of *input*, writing to *output*.

    Parameters
    ----------
    input : str
        Path of the text file to clean, read as UTF-8.
    output : str
        Path of the cleaned UTF-8 output file.
    """
    # NOTE: the parameter name `input` shadows the builtin; kept unchanged
    # for backward compatibility with existing callers.
    with open(input, 'rt', encoding="utf8") as input_f, open(output, 'w', encoding="utf-8") as file:
        for line in input_f:
            # Replace URLs with a space so surrounding words stay separated.
            text = re.sub(r'http\S+', " ", line)
            # Remove the listed punctuation characters. The original class
            # `[!@#$?.;)-:,/]` contained the accidental range `)-:`
            # (codepoints 41-58), which also stripped the digits 0-9;
            # the hyphen is escaped so only the listed characters match.
            text = re.sub(r'[!@#$?.;)\-:,/]', '', text)
            file.write(text)
|
|
|
|
if __name__ == "__main__":
    # Guarding the driver code prevents these file-writing side effects from
    # running when the module is imported (e.g. by tests or other scripts).
    # Split the raw training TSV by label, then clean every data split.
    create_train("train/train.tsv.gz", "train/values_1.txt", "train/values_0.txt")
    tokenize("train/values_0.txt", "train/values_0_tokenized.txt")
    tokenize("train/values_1.txt", "train/values_1_tokenized.txt")
    tokenize("test-A/in.tsv", "test-A/in_tokenized.tsv")
    tokenize("dev-0/in.tsv", "dev-0/in_tokenized.tsv")