"""Split a labelled TSV training set by class and clean the resulting text files."""
import pickle
|
|
import re
|
|
import pandas as pd
|
|
from nltk.tokenize import word_tokenize
|
|
from spacy import tokenizer
|
|
from spacy.tokenizer import Tokenizer
|
|
from spacy.lang.en import English
|
|
|
|
|
|
def create_train(input, output1, output0):
    """Split a labelled TSV file into two text files, one per label.

    Parameters
    ----------
    input : str
        Path to a tab-separated file (optionally gzip-compressed) whose two
        columns are a 0/1 label followed by the text.
    output1 : str
        Output path receiving the text of rows labelled 1.
    output0 : str
        Output path receiving the text of rows labelled 0.
    """
    # NOTE: the parameter name `input` shadows the builtin; kept unchanged
    # for backward compatibility with existing callers.
    create_dictionary = pd.read_csv(
        input,
        delimiter="\t",
        header=None,
        names=["num", "txt"],
        # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines="skip"`
        # (pandas >= 1.3) is the supported equivalent.
        on_bad_lines="skip",
        skip_blank_lines=False,
    )
    # Cap the training set at the first 900k rows.
    create_dictionary = create_dictionary[:900000]
    # `where` masks non-matching rows with NaN; `dropna` discards them.
    values_1 = (create_dictionary['txt'].where(create_dictionary['num'] == 1)).dropna()
    values_0 = (create_dictionary['txt'].where(create_dictionary['num'] == 0)).dropna()
    values_1.to_csv(output1, header=None, index=None)
    values_0.to_csv(output0, header=None, index=None)
|
|
|
|
|
|
def tokenize(input, output):
    """Strip URLs and punctuation from each line of *input*, writing to *output*.

    Parameters
    ----------
    input : str
        Path of the text file to clean, read as UTF-8.
    output : str
        Path of the cleaned UTF-8 output file.
    """
    # NOTE: the parameter name `input` shadows the builtin; kept unchanged
    # for backward compatibility with existing callers.
    with open(input, 'rt', encoding="utf8") as input_f, open(output, 'w', encoding="utf-8") as file:
        for line in input_f:
            # Replace URLs with a space so surrounding words stay separated.
            text = re.sub(r'http\S+', " ", line)
            # Remove the listed punctuation characters. The original class
            # `[!@#$?.;)-:,/]` contained the accidental range `)-:`
            # (codepoints 41-58), which also stripped the digits 0-9;
            # the hyphen is escaped so only the listed characters match.
            text = re.sub(r'[!@#$?.;)\-:,/]', '', text)
            file.write(text)
|
|
|
|
if __name__ == "__main__":
    # Guarding the driver code prevents these file-writing side effects from
    # running when the module is imported (e.g. by tests or other scripts).
    # Split the raw training TSV by label, then clean every data split.
    create_train("train/train.tsv.gz", "train/values_1.txt", "train/values_0.txt")
    tokenize("train/values_0.txt", "train/values_0_tokenized.txt")
    tokenize("train/values_1.txt", "train/values_1_tokenized.txt")
    tokenize("test-A/in.tsv", "test-A/in_tokenized.tsv")
    tokenize("dev-0/in.tsv", "dev-0/in_tokenized.tsv")