Compare commits


No commits in common. "master" and "ISI-13" have entirely different histories.

19 changed files with 305528 additions and 348702 deletions

.gitignore vendored (5 changed lines)

@@ -1,8 +1,11 @@
+in.tsv
+model.pkl
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 .DS_Store
 .token
+.idea

Binary file not shown.


@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 Sources
 -------


@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

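A side note on the added metrics (my reading of the config change, not stated in the diff): F0 and F9999999 are simply the F-beta score at extreme values of beta,

F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)

which tends to precision P as beta approaches 0 and to recall R as beta grows large, so the new geval config effectively reports Accuracy, F1, precision, and recall, each to 4 decimal places.
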
File diff suppressed because it is too large

File diff suppressed because it is too large

mytokenize.py Normal file (12 additions)

@@ -0,0 +1,12 @@
import re

"""
Takes a document and returns a list of tokens.
"""

def tokenize(d):
    d = re.sub(r'(\s+|\\n)', ' ', d)
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
    d = re.sub(r'\d+', 'NUM', d)
    return re.split(r'\s+', d)

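A quick, hypothetical sanity check of the new tokenizer (not part of this commit; the sample sentence is invented):

from mytokenize import tokenize

print(tokenize("Saw 2 strange lights over the house! https://example.com/photo"))
# punctuation becomes its own token, digit runs are replaced by NUM, and the URL
# collapses to a single placeholder (lowercased to "url", because lower() runs after
# the URL substitution but before the digit substitution); a trailing empty token
# can show up when the processed string ends in whitespace
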
predict.py Normal file (13 additions)

@@ -0,0 +1,13 @@
def predict(weights, word_to_index, tokenized_text):
    ypred = weights[0]  # bias or w0
    for x in tokenized_text:
        if x in word_to_index:
            index = word_to_index[x]
            ypred += weights[index] * 1
    return ypred

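A toy illustration of predict (values invented, not from the repository): the score is the bias weights[0] plus one weight per token occurrence, and tokens missing from word_to_index are skipped.

from predict import predict

weights = [1.0, 0.5, -0.25]               # index 0 is the bias, then one weight per word index
word_to_index = {"ghost": 1, "camera": 2}
print(predict(weights, word_to_index, ["ghost", "ghost", "unknown"]))
# prints 2.0 = 1.0 + 0.5 + 0.5: "ghost" contributes twice, "unknown" is ignored
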
predict_dev0_testA.py Normal file (20 additions)

@@ -0,0 +1,20 @@
import pickle
from predict import predict
from mytokenize import tokenize
import sys

path_pkl = "BestLinearRegressionModel.pkl"
model = pickle.load(open(path_pkl, "rb"))
word_to_index, weights, best_MSE = model

for line in sys.stdin:
    text, date = line.split("\t")
    tokenized_text = tokenize(text)
    ypred = predict(weights, word_to_index, tokenized_text)
    if ypred < 0.5:
        print("0")
    else:
        print("1")


@@ -0,0 +1,83 @@
vocabulary = ....  # the set of all words ...
word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1

# initialization
weights = []
for ix in range(0, len(vocabulary) + 1):
    weights.append(a random value from the interval (-0.01, 0.01))
# or: weights[0] = 2012.0
learning_rate = 0.000001
loss_sum = 0.0
loss_counter = 0

while ....:
    d, y = a random example from the training set
    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
    # loss function
    loss = (y_hat - y) ** 2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0
    # learning - the update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta

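To make the update rule above concrete, here is one SGD step with invented numbers (not from the notes): every weight starts at 0.0, the document contains the word "ghost" twice, the target is y = 1, and learning_rate = 0.000001.

learning_rate = 0.000001
weights = {"bias": 0.0, "ghost": 0.0}                # hypothetical two-entry model
y, count = 1, 2                                      # the word occurs twice in the document
y_hat = weights["bias"] + weights["ghost"] * count   # = 0.0
delta = (y_hat - y) * learning_rate                  # = -0.000001
weights["bias"] -= delta                             # -> 0.000001
weights["ghost"] -= count * delta                    # -> 0.000002
# recomputing y_hat now gives 0.000005, nudged toward the target y = 1
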
score.txt Normal file (1 addition)

@@ -0,0 +1 @@
MSE: 0.16878347523946954


@@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965


@@ -1,20 +1,120 @@
-import re
-import sys
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
-    else:
-        print("S")
+import os
+import sys
+import pandas as pd
+import pickle
+import csv
+from random import uniform
+from predict import predict
+from mytokenize import tokenize
+
+# load data:
+train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
+texts = train["text"]
+y = pd.read_csv("train/expected.tsv", header=None, names=["isparanormal"])
+y = list(y["isparanormal"])
+print(y)
+print(y[0])
+
+tokenized_texts = []
+word_to_index = {}
+index_to_word = {}
+word_count = {}
+learning_rate = 0.000001
+
+for doc in texts:
+    tokens = tokenize(doc)
+    tokenized_texts.append(tokens)
+    for token in tokens:
+        if token in word_count:
+            word_count[token] += 1
+        else:
+            word_count[token] = 1
+
+# vocabulary sorted from the most frequent words to the least frequent ones
+vocabulary = sorted(word_count, key=word_count.get, reverse=True)
+for w in vocabulary:
+    i = len(word_to_index) + 1
+    word_to_index[w] = i
+    index_to_word[i] = w
+
+weights = []
+for i in range(0, len(vocabulary) + 1):
+    weights.append(uniform(-0.01, 0.01))
+
+best_MSE = 1800
+path_pkl = "BestLinearRegressionModel.pkl"
+if os.path.isfile(path_pkl):
+    model = pickle.load(open(path_pkl, "rb"))
+    word_to_index, weights, best_MSE = model
+    print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
+
+loss_sum = 0.0
+loss_counter = 0
+for_MSE_sum = 0.0
+MSE = 0.0
+counter = 0
+inc_counter = 0
+
+while True:
+    for i in range(0, len(tokenized_texts)):
+        # computing ypred - start
+        ypred = predict(weights, word_to_index, tokenized_texts[i])
+        # computing ypred - end
+        # computing the loss - start
+        loss = (ypred - y[i]) ** 2.0
+        loss_sum += loss
+        for_MSE_sum += loss
+        loss_counter += 1
+        # computing the loss - end
+        if loss_counter == 1000:
+            # print(str(loss_sum/1000), "\t", str(MSE))
+            loss_counter = 0
+            loss_sum = 0.0
+        # learning - weight update - start
+        delta = (ypred - y[i]) * learning_rate
+        weights[0] -= delta
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            weights[index] -= delta
+        # learning - weight update - end
+    # optional strategy to improve training - start
+    temp_MSE = for_MSE_sum / len(tokenized_texts)
+    for_MSE_sum = 0.0
+    if best_MSE > temp_MSE:
+        best_MSE = temp_MSE
+        model = (word_to_index, weights, best_MSE)
+        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
+        with open("score.txt", "w") as out:
+            out.write("MSE:\t")
+            out.write(str(best_MSE))
+    if temp_MSE > MSE:
+        counter += 1
+    else:
+        inc_counter += 1
+    if counter > 2:
+        learning_rate *= 0.1
+        counter = 0
+    if inc_counter > 4:
+        learning_rate /= 0.90
+        inc_counter = 0
+    # optional strategy to improve training - end
+    MSE = temp_MSE
+    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
+"""
+happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
+"""


@@ -1,4 +1,5 @@
-xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
-xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
+xzcat dev-0/in.tsv.xz | python3 predict_dev0_testA.py > dev-0/out.tsv
+xzcat test-A/in.tsv.xz | python3 predict_dev0_testA.py > test-A/out.tsv
geval -t dev-0 >>scores.txt
geval -t dev-0

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -1,4 +0,0 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt