paranormal-or-skeptic/.ipynb_checkpoints/Untitled-checkpoint.ipynb
2020-03-13 01:24:43 +01:00

11 KiB
Raw Permalink Blame History

import pandas as pd

# Load the training texts (an xz-compressed TSV with no header row) and
# their labels, then attach the labels as an extra column.
train = pd.read_csv(
    "train/in.tsv.xz",
    sep="\t",
    header=None,
    names=["text", "time"],
    compression='xz',
)
expected = pd.read_csv("train/expected.tsv", header=None)
train["expected"] = expected
# Text-length distribution for the ' S' class (presumably "skeptic",
# given the task name — confirm against the dataset description).
train[train["expected"] == ' S']["text"].str.len().describe()
count    185478.000000
mean        303.405056
std         494.328936
min           3.000000
25%          68.000000
50%         151.000000
75%         341.000000
max       10251.000000
Name: text, dtype: float64
# Same text-length summary for the ' P' class (presumably "paranormal").
train.loc[train["expected"] == ' P', "text"].str.len().describe()
count    104063.000000
mean        298.150995
std         504.984133
min           3.000000
25%          65.000000
50%         146.000000
75%         330.000000
max       10161.000000
Name: text, dtype: float64
import string
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the nltk corpus module to
# a plain set of English stop words; clean_text below depends on the set.
stopwords = set(stopwords.words('english'))
# Download the tokenizer models required by word_tokenize.
nltk.download("punkt")

def clean_text(text):
    """Tokenize *text* and return its lowercased alphabetic words with
    English stop words removed.

    Note: ``token.isalpha()`` already rejects any token that contains
    punctuation or digits, so the original per-call ``str.translate``
    punctuation-stripping pass was a no-op on the surviving tokens and
    has been removed (same output, less work per row).
    """
    tokens = word_tokenize(text)
    words = [token.lower() for token in tokens if token.isalpha()]
    # `stopwords` is the module-level set built from nltk's English list.
    return [word for word in words if word not in stopwords]

# Replace each raw text with its cleaned list of tokens.
train['text'] = train['text'].map(clean_text)
[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
train['text']
0                               [medical, issues, recently]
1         [supposedly, aluminum, barium, strontium, used...
2                               [nobel, prizes, make, rich]
3                           [came, article, stayed, doctor]
4         [resorted, insults, got, owned, directly, afte...
                                ...                        
289536    [really, baby, shampoo, actually, highly, alka...
289537    [gives, example, brendan, reilly, doctor, came...
289538                                 [ca, fix, stupidity]
289539    [excellent, points, also, looking, bit, progra...
289540         [earlier, year, may, couple, days, ago, nov]
Name: text, Length: 289541, dtype: object
from collections import Counter
def counter(text):
    """Count word frequencies across an iterable of token lists.

    Parameters
    ----------
    text : iterable of iterables of str
        e.g. a pandas Series whose elements are lists of words.

    Returns
    -------
    collections.Counter
        Mapping of word -> total number of occurrences.
    """
    cnt = Counter()
    for msgs in text:
        # Counter.update counts iterable elements in one C-level pass,
        # replacing the original per-word `cnt[msg] += 1` inner loop.
        cnt.update(msgs)
    return cnt

# Per-class word frequencies, reduced to the top 100 words of each class
# as small (words, counts) DataFrames.
text_cnt_s = counter(train[train['expected'] == ' S']['text'])
text_cnt_p = counter(train[train['expected'] == ' P']['text'])
text_s = pd.DataFrame(text_cnt_s.most_common(100), columns=['words', 'counts'])
text_p = pd.DataFrame(text_cnt_p.most_common(100), columns=['words', 'counts'])
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])
concatenated
sns.set(style="whitegrid")
g = sns.catplot(x="words", y="counts", data=concatenated,
                height=6, kind="bar", palette="muted",style="dataset")
/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.
counts1 counts2 dataset words1 words2
0 39094.0 NaN s would NaN
1 36978.0 NaN s like NaN
2 36461.0 NaN s people NaN
3 29143.0 NaN s one NaN
4 26827.0 NaN s think NaN
... ... ... ... ... ...
95 NaN 3007.0 p NaN kind
96 NaN 2990.0 p NaN show
97 NaN 2970.0 p NaN far
98 NaN 2964.0 p NaN feel
99 NaN 2915.0 p NaN try

200 rows × 5 columns