add base project

2020-06-13 23:03:53 +02:00 · 2020-06-13 23:03:53 +02:00 · 1abd5290cd
commit 1abd5290cd
parent 80c5bdf9a1
2 changed files with 122 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -58,3 +58,9 @@ docs/_build/
 # PyBuilder
 target/

+# exclude the train and test csv files
+train.csv
+test.csv
+
+# virtualenv
+venv/
--- a/main.py
+++ b/main.py
@ -0,0 +1,116 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import string
+import re
+import nltk
+import numpy as np
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords 
+from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression 
+from sklearn.model_selection import train_test_split
+
+# Fetch the NLTK resources needed further down (stop-word list, WordNet
+# data for the lemmatizer, punkt models for word_tokenize).
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('punkt')
+
+train = pd.read_csv('train.csv')
+test = pd.read_csv('test.csv')
+
+# NOTE: the head()/describe() results below are discarded when this runs as
+# a plain script; they only display something in an interactive session.
+train.head()
+train_count = train.count()
+print(train_count)
+# Non-null count of every column as a percentage of the first column's
+# count. train_count is indexed by column name, so the first entry has to
+# be taken positionally with .iloc; an integer key on a label-indexed
+# Series is deprecated in pandas.
+print(train_count / train_count.iloc[0] * 100)
+train = train.drop(['keyword', 'location'], axis=1)
+
+test.head()
+test.describe()
+
+# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
+def remove_urls(text):
+    """Strip @handles, URLs and all non-alphanumeric characters from text.
+
+    Every match is replaced by a space and the result is re-joined on single
+    spaces, so the returned string has no leading, trailing or repeated
+    whitespace. Only characters other than ASCII letters, digits, space and
+    tab are treated as symbols; for a hashtag this removes the '#' and keeps
+    the tag word itself.
+
+    Args:
+        text: the string to clean.
+
+    Returns:
+        The cleaned, single-space separated string.
+    """
+    # Raw string: "\w", "\S" and "\/" are invalid escape sequences in a
+    # normal string literal. The handle class includes "_" so "@user_name"
+    # is removed as a whole instead of leaving "name" behind.
+    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
+    return ' '.join(re.sub(pattern, ' ', text).split())
+
+# make all text lowercase
+def text_lowercase(text):
+    """Return a lowercase copy of text."""
+    lowered = text.lower()
+    return lowered
+
+# remove numbers
+def remove_numbers(text):
+    """Delete every run of digits from text; nothing is put in its place."""
+    digit_run = re.compile(r'\d+')
+    return digit_run.sub('', text)
+
+# remove punctuation
+def remove_punctuation(text):
+    """Return text with every character of string.punctuation removed."""
+    # str.translate deletes any character whose ordinal maps to None
+    deletions = dict.fromkeys(map(ord, string.punctuation))
+    return text.translate(deletions)
+
+# tokenize
+def tokenize(text):
+    """Split text into a list of tokens using NLTK's word_tokenize."""
+    return word_tokenize(text)
+
+# remove stopwords
+# NLTK's English stop-word list, held in a set for membership tests
+stop_words = set(stopwords.words('english'))
+
+
+def remove_stopwords(text):
+    """Return the tokens of text that are not in stop_words, order kept.
+
+    text is an iterable of tokens (as produced by tokenize), not a string.
+    """
+    return [token for token in text if token not in stop_words]
+    
+# lemmatize
+# one WordNet lemmatizer instance, shared by every lemmatize() call
+lemmatizer = WordNetLemmatizer()
+
+
+def lemmatize(text):
+    """Return a list with each token of text passed through the lemmatizer.
+
+    text is an iterable of tokens; lemmatizer.lemmatize is called with its
+    default arguments for each one.
+    """
+    return list(map(lemmatizer.lemmatize, text))
+
+def preprocessing(text):
+    """Clean one raw text string and return it as a space-joined string.
+
+    Steps, in order: lowercase, strip handles/URLs/symbols, strip digits,
+    strip punctuation, tokenize, drop stop words, lemmatize. The first four
+    steps work on a string, the last three on a list of tokens.
+    """
+    string_steps = (text_lowercase, remove_urls, remove_numbers,
+                    remove_punctuation)
+    token_steps = (tokenize, remove_stopwords, lemmatize)
+    for step in string_steps + token_steps:
+        text = step(text)
+    return ' '.join(text)
+
+
+# Run every training text through the cleaning pipeline and store the
+# result next to the raw text in a new 'pp_text' column.
+pp_text_train = [preprocessing(raw_text) for raw_text in train['text']]
+train['pp_text'] = pp_text_train
+
+# Same treatment for the test set.
+pp_text_test = [preprocessing(raw_text) for raw_text in test['text']]
+test['pp_text'] = pp_text_test
+
+train_text_data = list(train['pp_text'])
+test_text_data = list(test['pp_text'])
+y = train['target']
+
+# Hold out a validation split BEFORE fitting anything, so the reported
+# accuracy is measured on rows that neither the vectorizer nor the model
+# has seen. A fixed random_state makes the split, and therefore the
+# printed score, repeatable between runs.
+text_train, text_val, y_train, y_test = train_test_split(
+    train_text_data, y, random_state=42)
+
+# Fit the TF-IDF vocabulary and IDF weights on the training split only;
+# fitting on the validation or test text as well would leak information
+# from the evaluation data into the features.
+tf = TfidfVectorizer()
+fitted_vectorizer = tf.fit(text_train)
+
+X_train = fitted_vectorizer.transform(text_train)
+X_test = fitted_vectorizer.transform(text_val)
+# test.csv texts, vectorized with the same fitted vocabulary
+test_transform = fitted_vectorizer.transform(test_text_data)
+
+scikit_log_reg = LogisticRegression()
+model = scikit_log_reg.fit(X_train, y_train)
+
+# validation accuracy: share of predictions equal to the true target
+predictions = model.predict(X_test)
+print(np.mean(predictions == y_test.to_numpy()))