74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
import pandas as pd
|
|
from joblib import load
|
|
import string
|
|
import re
|
|
import featuretools as ft
|
|
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
|
|
|
|
# Lookup used later to turn predicted class ids into label names.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
id_to_labels = load('labels.pkl')

# Read the documents to classify, one per line. The context manager closes
# the file handle deterministically (it was previously left open).
# NOTE(review): the file is decoded with the platform default encoding while
# the result is written as UTF-8 -- confirm the input encoding.
with open('testdata.txt') as data_file:
    data = data_file.read().splitlines()

# One row per input line, with the positional row number exposed as an
# explicit 'index' column placed before the text.
columns_titles = ["index", "body_text"]
df = pd.DataFrame(data, columns=["body_text"])
df['index'] = df.index
df = df[columns_titles]

# Pre-trained estimator; see the security note on joblib.load above.
model = load('model.pkl')
|
|
|
|
def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation means membership in ``string.punctuation``; only the literal
    space character is excluded from the denominator.

    Args:
        text: The string to inspect.

    Returns:
        The punctuation ratio rounded to three decimals and then scaled to a
        percentage (so the value can carry float noise such as
        ``33.300000000000004``). Returns ``0.0`` when *text* has no non-space
        characters, instead of raising ``ZeroDivisionError``.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Empty or all-space line: there is nothing to take a ratio of.
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    return round(punct / non_space, 3) * 100
|
|
|
|
# Hand-crafted numeric features, one value per document:
#   body_len - number of characters that are not a literal space
#   punct%   - value returned by count_punct for the document text
df['body_len'] = df['body_text'].apply(lambda text: len(text) - text.count(" "))
df['punct%'] = df['body_text'].apply(count_punct)
|
|
|
|
#es = ft.EntitySet(id="text_data")
|
|
#es = es.entity_from_dataframe(entity_id="data",
|
|
# index='index',
|
|
# dataframe=df)
|
|
#from nlp_primitives import (
|
|
# DiversityScore,
|
|
# LSA,
|
|
# MeanCharactersPerWord,
|
|
# TitleWordCount,
|
|
# UpperCaseCount)
|
|
|
|
|
|
#trans = [DiversityScore,
|
|
# MeanCharactersPerWord,
|
|
# TitleWordCount,
|
|
# LSA,
|
|
# UpperCaseCount]
|
|
#feature_matrix, feature_defs = ft.dfs(entityset=es,
|
|
# target_entity='data',
|
|
# verbose=True,
|
|
# trans_primitives=trans,
|
|
# max_depth=4)
|
|
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
|
|
#feature_matrix.drop(["punct%"], axis=1, inplace=True)
|
|
|
|
|
|
# Vectorizing data
|
|
#def clean_text(text):
|
|
# text = "".join([word.lower() for word in text if word not in string.punctuation])
|
|
# tokens = re.split('\W+', text)
|
|
# text = [word for word in tokens]
|
|
# return text
|
|
# Bag-of-words vectorizer pinned to a stored vocabulary, so the term columns
# are fixed by that vocabulary rather than by the documents being scored.
# NOTE(review): vocabulary.pkl is presumably the vocabulary saved when the
# model was trained -- confirm.
loaded_vec = CountVectorizer(decode_error="replace", vocabulary=load('vocabulary.pkl'))

# With a fixed vocabulary there is nothing to learn, so transform() suffices.
# The counts stay sparse: TfidfTransformer accepts sparse input, so the dense
# .toarray() round-trip that used to sit here only cost memory.
counts = loaded_vec.transform(df.body_text)

# NOTE(review): the IDF weights are fitted on the documents being scored, not
# restored from training, so the weighting depends on the contents of the
# current batch. If a fitted transformer was saved at training time, load it
# and call transform() instead -- confirm against the training script.
transformer = TfidfTransformer()
transformed = transformer.fit_transform(counts)
|
|
|
|
# Model input: the two hand-crafted columns followed by the dense TF-IDF
# matrix. Both parts get a fresh 0..n-1 index so concat aligns them row by
# row; the TF-IDF columns are named by their integer position.
handcrafted = df[['body_len', 'punct%']].reset_index(drop=True)
tfidf_frame = pd.DataFrame(transformed.toarray()).reset_index(drop=True)
features = pd.concat([handcrafted, tfidf_frame], axis=1)
|
|
#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
|
|
|
|
# Predict one class id per row and translate each id through id_to_labels.
# NOTE(review): assuming id_to_labels is a dict, an id it does not contain
# becomes None rather than raising.
pred = model.predict(features)
labels = [id_to_labels.get(class_id) for class_id in pred]
df['label'] = labels

# The helper feature columns were only needed as model input; remove them
# before writing the result.
df.drop(columns=['body_len', 'punct%'], inplace=True)

# to_csv also writes the DataFrame's row index as an unnamed first column.
df.to_csv('result.csv', encoding='utf-8')
|
|
|