add w2v
This commit is contained in:
parent: d13443a750
commit: 4a86e3878e
dev-0/expected.tsv (10544 changed lines; diff suppressed because it is too large)
dev-0/out.tsv (10544 changed lines; diff suppressed because it is too large)
test-A/out.tsv (10304 changed lines; diff suppressed because it is too large)
w2v.py (101 changed lines; new file)
@@ -0,0 +1,101 @@
import string  # used for preprocessing
import re  # used for preprocessing
import numpy as np  # used for building document vectors
import nltk  # the Natural Language Toolkit, used for preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.linear_model import LogisticRegression  # our model
import gensim.models

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# remove @mentions, URLs and any other non-alphanumeric characters
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
    return new_text


# make all text lowercase
def text_lowercase(text):
    return text.lower()


# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result


# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)


# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text


# remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    text = [i for i in text if i not in stop_words]
    return text


# lemmatize
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    #text = ' '.join(text)
    return text

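# Illustrative check of what the pipeline returns (the sample sentence is
# made up for this comment, not taken from the dataset):
#   preprocessing("Check out https://example.com, voted 5 times in 2020!")
#   -> ['check', 'voted', 'time']
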
# read the posts: one post per line, text in the first tab-separated column
def make_posts_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            post = line.split('\t')[0]
            posts.append(preprocessing(post))
    return posts

# read the expected labels: one integer per line
def make_exp_list(exp_file):
    exp_list = []
    with open(exp_file) as f:
        for exp_line in f:
            y = int(exp_line)
            exp_list.append(y)
    return exp_list

tokens = make_posts_list("train/in.tsv")
Y = make_exp_list("train/out.tsv")

print("\nTraining the word2vec model...\n")
model = gensim.models.Word2Vec(tokens, size=300, min_count=1, workers=4)
# reducing the epochs will decrease the computation time
model.train(tokens, total_examples=len(tokens), epochs=4000)


# represent each post as the mean of its word vectors, so the feature
# matrix has one row per post and lines up with the labels in Y
# (model.wv.syn0 has one row per vocabulary word, not per post)
def make_doc_vectors(posts):
    doc_vecs = []
    for post in posts:
        word_vecs = [model.wv[w] for w in post if w in model.wv]
        doc_vecs.append(np.mean(word_vecs, axis=0) if word_vecs
                        else np.zeros(model.vector_size))
    return np.array(doc_vecs)


X_train = make_doc_vectors(tokens)
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, Y)

dev_tokens = make_posts_list("dev-0/in.tsv")
X_dev = make_doc_vectors(dev_tokens)

# Prediction for the first 15 dev samples
predict = clf.predict(X_dev[:15, :])
# Calculating the score of the predictions on the training set
score = clf.score(X_train, Y)
print("\nPrediction word2vec : \n", predict)
print("Score word2vec : \n", score)
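
# The commit also adds dev-0/out.tsv and test-A/out.tsv, but the script
# above never writes them; a minimal sketch of how such files could be
# produced (this loop is an assumption, not part of the original script):
for split in ("dev-0", "test-A"):
    X = make_doc_vectors(make_posts_list(split + "/in.tsv"))
    with open(split + "/out.tsv", "w") as out_f:
        for label in clf.predict(X):
            out_f.write(str(label) + "\n")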