Compare commits

...

No commits in common. "ISI-13" and "master" have entirely different histories.

19 changed files with 348702 additions and 305528 deletions

.gitignore vendored

@@ -1,11 +1,8 @@
in.tsv
model.pkl
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
.idea

Binary file not shown.


@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
,Glitch-in-the-Matrix, conspiracytheories).
Output label is 0 (for skeptic) and 1 (for paranormal).
Output label is `S` and `P`.
Sources
-------


@@ -1 +1 @@
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--metric Accuracy --precision 4
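
The two F-score metrics dropped here were stand-ins for precision and recall: with F_beta = (1 + beta^2) * P * R / (beta^2 * P + R), F_beta tends to P as beta -> 0 and to R as beta -> infinity, so --metric F0:N<Precision> and --metric F9999999:N<Recall> appear to have reported precision and recall under those display names before the config was trimmed to Accuracy alone.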

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,12 +0,0 @@
import re


def tokenize(d):
    """Takes a document and returns a list of tokens."""
    d = re.sub(r'(\s+|\\n)', ' ', d)                    # collapse whitespace and literal "\n"
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)  # mask URLs
    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")  # split punctuation off
    d = re.sub(r'\d+', 'NUM', d)                        # mask numbers
    return re.split(r'\s+', d)
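
For reference, a quick check of what the deleted tokenizer produces (the sample sentence is invented; the module is imported as mytokenize, matching the scripts below):

from mytokenize import tokenize

# URLs collapse to "url" (lower() runs after the URL substitution),
# digits become "NUM", and punctuation is split off into its own token.
print(tokenize("I saw 2 lights at www.example.com today!"))
# ['i', 'saw', 'NUM', 'lights', 'at', 'url', 'today', '!']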


@@ -1,13 +0,0 @@
def predict(weights, word_to_index, tokenized_text):
    ypred = weights[0]  # bias or w0
    for x in tokenized_text:
        if x in word_to_index:
            index = word_to_index[x]
            ypred += weights[index] * 1  # each occurrence adds the word's weight once
    return ypred
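
A toy invocation of predict (the weights and vocabulary here are made-up values, not the trained model):

from predict import predict

weights = [0.5, 0.2, -0.3]                  # w0 (bias), then one weight per word
word_to_index = {"ghost": 1, "science": 2}

# "ghost" adds its weight once per occurrence; "lol" is out-of-vocabulary and skipped.
print(predict(weights, word_to_index, ["ghost", "ghost", "lol"]))  # ~0.9 (0.5 + 0.2 + 0.2)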


@@ -1,20 +0,0 @@
import pickle
import sys

from predict import predict
from mytokenize import tokenize

path_pkl = "BestLinearRegressionModel.pkl"
model = pickle.load(open(path_pkl, "rb"))
word_to_index, weights, best_MSE = model
for line in sys.stdin:
    text, date = line.split("\t")
    tokenized_text = tokenize(text)
    ypred = predict(weights, word_to_index, tokenized_text)
    if ypred < 0.5:
        print("0")
    else:
        print("1")


@@ -1,83 +0,0 @@
vocabulary = .... the set of all words ...
word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1
# initialization
weights = []
for ix in range(0, len(vocabulary) + 1):
    weights.append(random value from the interval (-0.01, 0.01))
# or: weights[0] = 2012.0
learning_rate = 0.000001
loss_sum = 0.0
loss_counter = 0
while ....:
    d, y = random example from the training set
    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
    # loss function
    loss = (y_hat - y) ** 2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0
    # learning - update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
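
Since the constant factor from the squared-error derivative is folded into the learning rate, a single update step checks out numerically (toy numbers, not from the assignment):

# One SGD step on squared error: d/dw0 (y_hat - y)**2 = 2*(y_hat - y);
# the factor 2 is absorbed into learning_rate above.
learning_rate = 0.000001
y, y_hat = 1.0, 3.0                    # toy target and prediction
delta = (y_hat - y) * learning_rate    # 2e-06
w0 = 2012.0
w0 -= delta                            # bias nudged toward the target
print(w0)                              # 2011.999998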


@@ -1 +0,0 @@
MSE: 0.16878347523946954

scores.txt Normal file

@@ -0,0 +1,15 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965


@@ -1,120 +1,20 @@
import os
import re
import sys
import pandas as pd
import pickle
import csv
from random import uniform
from predict import predict
from mytokenize import tokenize

# load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = pd.read_csv("train/expected.tsv", header=None, names=["isparanormal"])
y = list(y["isparanormal"])
print(y)
print(y[0])
tokenized_texts = []
word_to_index = {}
index_to_word = {}
word_count = {}
learning_rate = 0.000001
for doc in texts:
    tokens = tokenize(doc)
    tokenized_texts.append(tokens)
    for token in tokens:
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1
# vocabulary sorted from the most frequent words to the least frequent
vocabulary = sorted(word_count, key=word_count.get, reverse=True)
for w in vocabulary:
    i = len(word_to_index) + 1
    word_to_index[w] = i
    index_to_word[i] = w
weights = []
for i in range(0, len(vocabulary) + 1):
    weights.append(uniform(-0.01, 0.01))
best_MSE = 1800
path_pkl = "BestLinearRegressionModel.pkl"
if os.path.isfile(path_pkl):
    model = pickle.load(open(path_pkl, "rb"))
    word_to_index, weights, best_MSE = model
    print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
loss_sum = 0.0
loss_counter = 0
for_MSE_sum = 0.0
MSE = 0.0
counter = 0
inc_counter = 0
while True:
    for i in range(0, len(tokenized_texts)):
        #@@ Computing ypred - start
        ypred = predict(weights, word_to_index, tokenized_texts[i])
        #@@ Computing ypred - end
        #@@ Computing the loss - start
        loss = (ypred - y[i]) ** 2.0
        loss_sum += loss
        for_MSE_sum += loss
        loss_counter += 1
        #@@ Computing the loss - end
        if loss_counter == 1000:
            # print(str(loss_sum/1000), "\t", str(MSE))
            loss_counter = 0
            loss_sum = 0.0
        #@@ Learning - weight update - start
        delta = (ypred - y[i]) * learning_rate
        weights[0] -= delta
        for x in tokenized_texts[i]:
            index = word_to_index[x]
            weights[index] -= delta
        #@@ Learning - weight update - end
    #@@ Optional strategy to improve training - start
    temp_MSE = for_MSE_sum / len(tokenized_texts)
    for_MSE_sum = 0.0
    if best_MSE > temp_MSE:
        best_MSE = temp_MSE
        model = (word_to_index, weights, best_MSE)
        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
        with open("score.txt", "w") as out:
            out.write("MSE:\t")
            out.write(str(best_MSE))
    if temp_MSE > MSE:
        counter += 1
    else:
        inc_counter += 1
    if counter > 2:
        learning_rate *= 0.1
        counter = 0
    if inc_counter > 4:
        learning_rate /= 0.90
        inc_counter = 0
    #@@ Optional strategy to improve training - end
    MSE = temp_MSE
    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))

import re
import sys

for line in sys.stdin:
    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
        print("P")
    else:
        print("S")
"""
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""


@@ -1,5 +1,4 @@
xzcat dev-0/in.tsv.xz | python3 predict_dev0_testA.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 predict_dev0_testA.py > test-A/out.tsv
geval -t dev-0
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large

File diff suppressed because it is too large

train/PssS_c_clean Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

train/s.sh Executable file

@@ -0,0 +1,4 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt
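
For readers unfamiliar with comm: comm -23 sortedP sortedS suppresses the lines unique to sortedS (column 2) and the lines common to both files (column 3), leaving only words that occur under the P label and never under S; uniq -c | sort -nr then ranks those P-only words by count. (The '^ P.*' and '^ S.*' patterns assume each pasted line starts with a space before the label.)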