transform into Prefect flow
parent 1abd5290cd
commit 7400bc17e1

main.py (145 changed lines)
@@ -6,64 +6,62 @@ import nltk
 import numpy as np
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
+from prefect import task, Flow
+from pandas import DataFrame
+from typing import List

 nltk.download('stopwords')
 nltk.download('wordnet')
 nltk.download('punkt')

-train = pd.read_csv('train.csv')
-test = pd.read_csv('test.csv')
-
-train.head()
-train_count=train.count()
-print(train_count)
-print(train_count/train_count[0]*100)
-train = train.drop(['keyword', 'location'], axis = 1)
-
-test.head()
-test.describe()
-
 # remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
 def remove_urls(text):
     new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
     return new_text

 # make all text lowercase
 def text_lowercase(text):
     return text.lower()

 # remove numbers
 def remove_numbers(text):
     result = re.sub(r'\d+', '', text)
     return result

 # remove punctuation
 def remove_punctuation(text):
     translator = str.maketrans('', '', string.punctuation)
     return text.translate(translator)

 # tokenize
 def tokenize(text):
     text = word_tokenize(text)
     return text

 # remove stopwords
 stop_words = set(stopwords.words('english'))
 def remove_stopwords(text):
     text = [i for i in text if not i in stop_words]
     return text

 # lemmatize
 lemmatizer = WordNetLemmatizer()
 def lemmatize(text):
     text = [lemmatizer.lemmatize(token) for token in text]
     return text

 def preprocessing(text):
     text = text_lowercase(text)
     text = remove_urls(text)
@@ -76,41 +74,98 @@ def preprocessing(text):
     return text

-pp_text_train = [] # our preprocessed text column
-for text_data in train['text']:
-    pp_text_data = preprocessing(text_data)
-    pp_text_train.append(pp_text_data)
-train['pp_text'] = pp_text_train # add the preprocessed text as a column
-
-pp_text_test = [] # our preprocessed text column
-for text_data in test['text']:
-    pp_text_data = preprocessing(text_data)
-    pp_text_test.append(pp_text_data)
-test['pp_text'] = pp_text_test # add the preprocessed text as a column
-
-train_text_data = list(train['pp_text'])
-test_text_data = list(test['pp_text'])
-corpus = train_text_data + test_text_data
-
-tf=TfidfVectorizer()
-# the vectorizer must be fit onto the entire corpus
-fitted_vectorizer = tf.fit(corpus)
-
-# train
-train_transform = fitted_vectorizer.transform(train['pp_text'])
-y = train['target']
-# test
-test_transform = fitted_vectorizer.transform(test['pp_text'])
-
-X=train_transform
-X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-scikit_log_reg = LogisticRegression()
-model=scikit_log_reg.fit(X_train, y_train)
-
-predictions = model.predict(X_test)
-count = 0
-for guess, answer in zip(predictions, y_test):
-    if guess == answer:
-        count += 1
-print(count/len(y_test))
+@task
+def get_train_set() -> DataFrame:
+    train = pd.read_csv('train.csv')
+    train = train.drop(['keyword', 'location'], axis=1)
+    return train
+
+@task
+def get_test_set() -> DataFrame:
+    return pd.read_csv('test.csv')
+
+@task
+def preprocess_train(train: DataFrame) -> DataFrame:
+    pp_text_train = []
+    for text_data in train['text']:
+        pp_text_data = preprocessing(text_data)
+        pp_text_train.append(pp_text_data)
+    train['pp_text'] = pp_text_train
+    return train
+
+@task
+def preprocess_test(test: DataFrame) -> DataFrame:
+    pp_text_test = []
+    for text_data in test['text']:
+        pp_text_data = preprocessing(text_data)
+        pp_text_test.append(pp_text_data)
+    test['pp_text'] = pp_text_test
+    return test
+
+@task
+def prepare_vectorizer(train_data: DataFrame, test_data: DataFrame) -> TfidfVectorizer:
+    train_text_data = list(train_data['pp_text'])
+    test_text_data = list(test_data['pp_text'])
+    corpus = train_text_data + test_text_data
+    tf = TfidfVectorizer()
+    fitted_vectorizer = tf.fit(corpus)
+    return fitted_vectorizer
+
+@task
+def transform_train(vectorizer: TfidfVectorizer, train_set: DataFrame) -> DataFrame:
+    return vectorizer.transform(train_set)
+
+@task
+def transform_test(vectorizer: TfidfVectorizer, test_set: DataFrame) -> DataFrame:
+    return vectorizer.transform(test_set)
+
+@task
+def split_test_set(X: DataFrame, Y: DataFrame) -> dict:
+    X_train, X_test, y_train, y_test = train_test_split(X, Y)
+    return {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
+
+@task
+def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
+    scikit_log_reg = LogisticRegression()
+    model = scikit_log_reg.fit(X, Y)
+    return model
+
+@task
+def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
+    predictions = model.predict(X)
+    count = 0
+    for guess, answer in zip(predictions, Y):
+        if guess == answer:
+            count += 1
+    print("> model score: ", count/len(Y))
+
+
+if __name__ == "__main__":
+    with Flow("My First Flow!") as flow:
+        train_data = get_train_set()
+        test_data = get_test_set()
+
+        train_data = preprocess_train(train_data)
+        test_data = preprocess_test(test_data)
+
+        vectorizer = prepare_vectorizer(train_data, test_data)
+
+        vectorized_train_data = transform_train(vectorizer, train_data['pp_text'])
+        vectorized_test_data = transform_test(vectorizer, test_data['pp_text'])
+
+        splitted_data = split_test_set(vectorized_train_data, train_data['target'])
+        model = train_model(splitted_data['X_train'], splitted_data['y_train'])
+        evaluate(model, splitted_data['X_test'], splitted_data['y_test'])
+
+    flow.validate()
+    # flow.visualize()
+    flow.run()
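
A note on the pattern this commit introduces: in Prefect 1.x, calling @task-decorated functions inside a "with Flow(...)" block does not execute them immediately. Each call registers a node in a DAG, and flow.run() later executes that graph in dependency order; indexing a task result inside the flow, as in splitted_data['X_train'] above, likewise becomes a small downstream task. Below is a minimal, self-contained sketch of that pattern. The names add, report, and demo_flow are illustrative only and are not part of this commit.

from prefect import task, Flow

@task
def add(x, y):
    return x + y

@task
def report(total):
    # runs only after add() has produced its result
    print("total =", total)

with Flow("demo") as demo_flow:
    total = add(1, 2)   # registers a node in the DAG; nothing runs yet
    report(total)       # registers report with a dependency on add

demo_flow.run()         # executes the DAG in dependency order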
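The evaluate task computes accuracy with a manual counting loop. For reference, an equivalent formulation using scikit-learn's built-in metric; this is a sketch, not part of the commit, with model, X, and Y meaning the same things as inside evaluate above.

from sklearn.metrics import accuracy_score

# yields the same value as the count/len(Y) loop in evaluate()
print("> model score: ", accuracy_score(Y, model.predict(X)))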