TwitterSentimentAnalysis/code.py

import re

import pandas as pd

def create_train(input_path, output1, output0):
    """Split the labelled training TSV into one text file per sentiment class."""
    # Headerless TSV: column 0 is the 0/1 sentiment label, column 1 the tweet text.
    df = pd.read_csv(input_path, delimiter="\t", header=None, names=["num", "txt"],
                     on_bad_lines="skip",  # pandas >= 1.3 spelling of error_bad_lines=False
                     skip_blank_lines=False)
    df = df[:900000]  # cap the training set at 900k rows
    values_1 = df["txt"].where(df["num"] == 1).dropna()
    values_0 = df["txt"].where(df["num"] == 0).dropna()
    values_1.to_csv(output1, header=False, index=False)
    values_0.to_csv(output0, header=False, index=False)
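
# A minimal sketch of what train/train.tsv.gz is expected to hold, inferred from
# the read_csv call above (the sample rows are hypothetical): two tab-separated,
# headerless columns, sentiment label then tweet text.
#
#   1	good morning twitter
#   0	my flight got cancelled again
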
def tokenize(input_path, output_path):
    """Crude regex cleanup, one tweet per line: drop URLs, then punctuation."""
    with open(input_path, "rt", encoding="utf8") as input_f, \
         open(output_path, "w", encoding="utf-8") as file:
        for line in input_f:
            # Earlier experiments, kept for reference:
            # text = line.lower()
            # text = re.sub(r'\\n+', " ", text)
            # text = re.sub(r'\/[a-z]\/', " ", text)
            # text = re.sub(r'\s{2,}', " ", text)
            # text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            # text = re.sub(r'^\s', " ", text)
            text = re.sub(r'http\S+', " ", line)
            # Hyphen last so it is a literal, not a range: the )-: span in the
            # original class [!@#$?.;)-:,/] also stripped digits and * + .
            text = re.sub(r'[!@#$?.;):,/-]', '', text)
            file.write(text)
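
# The file originally imported spaCy's Tokenizer without using it; below is a
# minimal sketch of that library-based alternative to the regex cleanup (assumes
# the spacy package is installed; the rule-based tokenizer needs no trained
# model). It is not called by the pipeline below.
def tokenize_spacy(input_path, output_path):
    from spacy.lang.en import English  # optional dependency, hence the local import
    nlp = English()  # blank English pipeline, tokenizer only
    with open(input_path, "rt", encoding="utf8") as input_f, \
         open(output_path, "w", encoding="utf-8") as file:
        for line in input_f:
            tokens = [token.text for token in nlp.tokenizer(line.strip())]
            file.write(" ".join(tokens) + "\n")

# Driver: split the labelled training data, then clean every split.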
create_train("train/train.tsv.gz", "train/values_1.txt", "train/values_0.txt")
tokenize("train/values_0.txt", "train/values_0_tokenized.txt")
tokenize("train/values_1.txt", "train/values_1_tokenized.txt")
tokenize("test-A/in.tsv", "test-A/in_tokenized.tsv")
tokenize("dev-0/in.tsv", "dev-0/in_tokenized.tsv")