import pandas as pd # our main data management package
import matplotlib.pyplot as plt # our main display package
import string # used for preprocessing
import re # used for preprocessing
import nltk # the Natural Language Toolkit, used for preprocessing
import numpy as np # used for managing NaNs
from prefect import task, Flow, Parameter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression # our model
from sklearn.model_selection import train_test_split

# download the NLTK data needed for tokenizing, stopword removal, and lemmatizing
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

train = pd.read_csv('train.csv')
train_count = train.count()
train = train.drop(['keyword', 'location'], axis=1)
test = pd.read_csv('test.csv')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
pp_text_train = [] # our preprocessed text column
pp_text_test = [] # our preprocessed text column

@task(log_stdout=True)
def describe(file):
    # print the summary so log_stdout=True actually captures it in the flow logs
    print(file.describe())

with Flow("Prepare") as prepare:
    file = Parameter('file')
    describe(file)

prepare.run(file=train)
prepare.run(file=test)

# remove urls, handles, and the hashtag from hashtags
# (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
    return new_text

# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text

# remove stopwords
def remove_stopwords(text):
    text = [i for i in text if i not in stop_words]
    return text

# lemmatize
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

# run every preprocessing step in sequence on a single string
def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = ' '.join(text)
    return text

@task(log_stdout=True)
def preprocess(file, pp_text):
    # mutates the passed-in list and DataFrame in place, which works here
    # because the flow runs with the local executor
    for text_data in file['text']:
        pp_text_data = preprocessing(text_data)
        pp_text.append(pp_text_data)
    file['pp_text'] = pp_text # add the preprocessed text as a column

with Flow("Preprocess") as preprocessData:
    file = Parameter('file')
    pp_text = Parameter('pp_text')
    preprocess(file, pp_text)

preprocessData.run(file=train, pp_text=pp_text_train)
preprocessData.run(file=test, pp_text=pp_text_test)

# fit the TF-IDF vectorizer on the combined train and test text so both
# sets share one vocabulary
train_text_data = list(train['pp_text'])
test_text_data = list(test['pp_text'])
corpus = train_text_data + test_text_data

tf = TfidfVectorizer()
fitted_vectorizer = tf.fit(corpus)

# train
train_transform = fitted_vectorizer.transform(train['pp_text'])
y = train['target']

# test
test_transform = fitted_vectorizer.transform(test['pp_text'])

X = train_transform
print(X)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scikit_log_reg = LogisticRegression()
model = scikit_log_reg.fit(X_train, y_train)

# score the model on the hold-out split
predictions = model.predict(X_test)
count = 0
for guess, answer in zip(predictions, y_test):
    if guess == answer:
        count += 1
print(count / len(y_test))
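
# A possible next step, not in the original script: the fitted model is never
# applied to test_transform above, so the sketch below labels the test set and
# writes a submission file. It assumes the test CSV has an 'id' column and
# that 'submission.csv' is the desired output path -- both are assumptions,
# so adjust them to your data. The accuracy_score call is an equivalent,
# built-in alternative to the manual counting loop above.
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, predictions)) # same value as the loop's ratio

test_predictions = model.predict(test_transform)
submission = pd.DataFrame({'id': test['id'], 'target': test_predictions})
submission.to_csv('submission.csv', index=False)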