Artur Dylewski 2020-06-03 18:01:38 +02:00
parent d13443a750
commit 4a86e3878e
4 changed files with 15797 additions and 15696 deletions

3 file diffs suppressed because they are too large

w2v.py (new file, 101 lines)

@@ -0,0 +1,101 @@
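# w2v.py: preprocess posts with NLTK, train a gensim word2vec model on them,
# and fit a scikit-learn logistic regression classifier on the resulting post vectors.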
import string  # used for preprocessing
import re  # used for preprocessing
import nltk  # the Natural Language Toolkit, used for preprocessing
import numpy as np  # used for building post vectors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.linear_model import LogisticRegression  # our model
import gensim.models

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# remove URLs, @-mentions, and any remaining non-alphanumeric characters
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
    return new_text

# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
# tokenize
def tokenize(text):
    return word_tokenize(text)

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return [i for i in text if i not in stop_words]
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    return [lemmatizer.lemmatize(token) for token in text]

# run the full preprocessing pipeline, returning a list of tokens
def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text
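# e.g. preprocessing("Running 5 miles, so TIRED!") -> ['running', 'mile', 'tired']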
# read and preprocess the posts (first tab-separated field of each line)
def make_posts_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            post = line.split('\t')[0]
            posts.append(preprocessing(post))
    return posts

# read the expected labels, one integer per line
def make_exp_list(exp_file):
    exp_list = []
    with open(exp_file) as f:
        for exp_line in f:
            exp_list.append(int(exp_line))
    return exp_list
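# build the training corpus: tokenized posts and their integer labels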
tokens = make_posts_list("train/in.tsv")
Y = make_exp_list("train/out.tsv")
print("\nTraining the word2vec model...\n")
# building the model also runs an initial training pass over the corpus
model = gensim.models.Word2Vec(tokens, size=300, min_count=1, workers=4)
# continue training; reducing the epochs will decrease the computation time
model.train(tokens, total_examples=len(tokens), epochs=4000)
# average the word vectors of a post to get one fixed-length feature vector
def post_vector(post):
    vectors = [model.wv[w] for w in post if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
X_train = np.array([post_vector(post) for post in tokens])
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, Y)
dev_tokens = make_posts_list("dev-0/in.tsv")
X_dev = np.array([post_vector(post) for post in dev_tokens])
# prediction for the first 15 dev samples
predict = clf.predict(X_dev[:15, :])
# accuracy of the classifier on the training set
score = clf.score(X_train, Y)
print("\nPrediction word2vec : \n", predict)
print("Score word2vec : \n", score)