Compare commits
No commits in common. "master" and "ISI-13" have entirely different histories.
5 .gitignore vendored
@@ -1,8 +1,11 @@
+in.tsv
+model.pkl
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 
 .DS_Store
 .token
+.idea
BIN BestLinearRegressionModel.pkl Normal file
Binary file not shown.
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
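For context on the new metrics (a reading of the config, not something stated in the repo): F0:N<Precision> and F9999999:N<Recall> appear to exploit the two limits of the F-beta score, and the N<...> part presumably just relabels the metric on the leaderboard. In LaTeX:

    F_\beta = (1+\beta^2)\,\frac{P \cdot R}{\beta^2 P + R},
    \qquad F_0 = P \ (\text{precision}),
    \qquad \lim_{\beta\to\infty} F_\beta = R \ (\text{recall})

So beta = 0 reduces exactly to precision, and a very large beta such as 9999999 is a practical stand-in for recall.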
10544 dev-0/expected.tsv
File diff suppressed because it is too large
10544 dev-0/out.tsv
File diff suppressed because it is too large
12 mytokenize.py Normal file
@@ -0,0 +1,12 @@
+import re
+
+
+def tokenize(d):
+    """
+    Takes a document and returns a list of tokens.
+    """
+    d = re.sub(r'(\s+|\\n)', ' ', d)
+    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
+    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
+    d = re.sub(r'\d+', 'NUM', d)
+    return re.split(r'\s+', d)
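A quick sanity check of the tokenizer (the sample sentence is invented, not from the repo). Note that the URL substitution runs before lowercasing, so the URL placeholder comes out as "url":

    from mytokenize import tokenize

    # digit runs become NUM, punctuation is padded into its own token,
    # and the URL (including its trailing comma) is masked
    print(tokenize("I saw 2 UFOs at https://example.com, wow!"))
    # -> ['i', 'saw', 'NUM', 'ufos', 'at', 'url', 'wow', '!']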
13 predict.py Normal file
@@ -0,0 +1,13 @@
+
+
+
+
+def predict(weights, word_to_index, tokenized_text):
+    ypred = weights[0]  # bias or w0
+    for x in tokenized_text:
+        if x in word_to_index:
+            index = word_to_index[x]
+            ypred += weights[index] * 1
+    return ypred
+
+
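A toy call with made-up weights, to show how the prediction accumulates (repeated tokens add their weight once per occurrence; out-of-vocabulary tokens are skipped):

    from predict import predict

    weights = [0.5, 0.1, -0.2]                  # hypothetical: bias w0, then per-word weights
    word_to_index = {"ghost": 1, "science": 2}  # hypothetical vocabulary
    print(predict(weights, word_to_index, ["ghost", "ghost", "xyzzy"]))
    # 0.5 + 0.1 + 0.1 = 0.7; "xyzzy" is not in the vocabulary, so it adds nothing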
20 predict_dev0_testA.py Normal file
@@ -0,0 +1,20 @@
+import pickle
+from predict import predict
+from mytokenize import tokenize
+import sys
+
+
+
+path_pkl = "BestLinearRegressionModel.pkl"
+model = pickle.load(open(path_pkl, "rb"))
+word_to_index, weights, best_MSE = model
+
+for line in sys.stdin:
+    text, date = line.split("\t")
+    tokenized_text = tokenize(text)
+    ypred = predict(weights, word_to_index, tokenized_text)
+    if ypred < 0.5:
+        print("0")
+    else:
+        print("1")
+
83 regresja liniowa pseudokod.txt Normal file
@@ -0,0 +1,83 @@
+vocabulary = .... the set of all words ...
+
+word_to_index_mapping = {}
+index_to_word_mapping = {}
+
+ix = 1
+for w in vocabulary:
+    word_to_index_mapping[w] = ix
+    index_to_word_mapping[ix] = w
+    ix += 1
+
+# initialization
+weights = []
+for ix in range(0, len(vocabulary) + 1):
+    weights.append(a random value from the interval (-0.01, 0.01))
+
+Or: weights[0] = 2012.0
+
+learning_rate = 0.000001
+
+loss_sum = 0.0
+loss_counter = 0
+
+while ....
+    d, y = a random example from the training set
+
+    # prediction
+    y_hat = weights[0]
+    for each word w in document d:
+        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
+
+    # loss function
+    loss = (y_hat - y) ** 2.0
+    loss_sum += loss
+    loss_counter += 1
+    if loss_counter % 1000 == 0:
+        print(loss_sum / 1000)
+        loss_counter = 0
+        loss_sum = 0.0
+
+    # learning - the weight update
+    delta = (y_hat - y) * learning_rate
+    weights[0] = weights[0] - delta
+    for each word w in document d:
+        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
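A minimal runnable rendering of this pseudocode (the toy corpus, labels, and hyperparameters are invented for illustration; the repo's real training loop lives in solution.py):

    import random

    # toy bag-of-words linear regression trained with SGD
    docs = [["ghost", "night", "ghost"], ["science", "study"], ["alien", "sky"]]
    ys = [1.0, 0.0, 1.0]

    vocabulary = sorted({w for d in docs for w in d})
    word_to_index = {w: i + 1 for i, w in enumerate(vocabulary)}  # index 0 is the bias

    weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocabulary) + 1)]
    learning_rate = 0.01

    for step in range(5000):
        d, y = random.choice(list(zip(docs, ys)))

        # prediction: bias + weight * (occurrence count) for each distinct word
        y_hat = weights[0]
        for w in set(d):
            y_hat += weights[word_to_index[w]] * d.count(w)

        # squared-error loss gives the gradient step below
        delta = (y_hat - y) * learning_rate
        weights[0] -= delta
        for w in set(d):
            weights[word_to_index[w]] -= d.count(w) * delta

    print([round(w, 3) for w in weights])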
15 scores.txt
@@ -1,15 +0,0 @@
-0.6920
-0.6857
-0.6969
-0.6931
-0.6927
-0.6952
-0.6969
-0.6969
-0.6959
-0.6959
-0.6965
-0.6965
-0.6965
-0.6954
-0.6965
128 solution.py
@@ -1,20 +1,120 @@
-import re
-import sys
-
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
-    else:
-        print("S")
-
-"""
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
-"""
+import os
+import sys
+import pandas as pd
+import pickle
+import csv
+from random import uniform
+
+from predict import predict
+from mytokenize import tokenize
+
+# load data:
+train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
+texts = train["text"]
+y = pd.read_csv("train/expected.tsv", header=None, names=["isparanormal"])
+y = list(y["isparanormal"])
+print(y)
+print(y[0])
+
+tokenized_texts = []
+word_to_index = {}
+index_to_word = {}
+word_count = {}
+learning_rate = 0.000001
+
+for doc in texts:
+    tokens = tokenize(doc)
+    tokenized_texts.append(tokens)
+    for token in tokens:
+        if token in word_count:
+            word_count[token] += 1
+        else:
+            word_count[token] = 1
+
+# vocabulary sorted from the most frequent words to the least frequent
+vocabulary = sorted(word_count, key=word_count.get, reverse=True)
+for w in vocabulary:
+    i = len(word_to_index) + 1
+    word_to_index[w] = i
+    index_to_word[i] = w
+
+weights = []
+for i in range(0, len(vocabulary) + 1):
+    weights.append(uniform(-0.01, 0.01))
+
+best_MSE = 1800
+
+path_pkl = "BestLinearRegressionModel.pkl"
+if os.path.isfile(path_pkl):
+    model = pickle.load(open(path_pkl, "rb"))
+    word_to_index, weights, best_MSE = model
+    print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
+
+loss_sum = 0.0
+loss_counter = 0
+for_MSE_sum = 0.0
+MSE = 0.0
+counter = 0
+inc_counter = 0
+while True:
+    for i in range(0, len(tokenized_texts)):
+        #@@ computing ypred - start
+        ypred = predict(weights, word_to_index, tokenized_texts[i])
+        #@@ computing ypred - end
+
+        #@@ computing the loss - start
+        loss = (ypred - y[i]) ** 2.0
+        loss_sum += loss
+        for_MSE_sum += loss
+        loss_counter += 1
+        #@@ computing the loss - end
+
+        if loss_counter == 1000:
+            # print(str(loss_sum / 1000), "\t", str(MSE))
+            loss_counter = 0
+            loss_sum = 0.0
+
+        #@@ learning - weight update - start
+        delta = (ypred - y[i]) * learning_rate
+        weights[0] -= delta
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            weights[index] -= delta
+        #@@ learning - weight update - end
+
+    #@@ optional strategy to improve training - start
+    temp_MSE = for_MSE_sum / len(tokenized_texts)
+    for_MSE_sum = 0.0
+    if best_MSE > temp_MSE:
+        best_MSE = temp_MSE
+        model = (word_to_index, weights, best_MSE)
+        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
+        with open("score.txt", "w") as out:
+            out.write("MSE:\t")
+            out.write(str(best_MSE))
+    if temp_MSE > MSE:
+        counter += 1
+    else:
+        inc_counter += 1
+
+    if counter > 2:
+        learning_rate *= 0.1
+        counter = 0
+
+    if inc_counter > 4:
+        learning_rate /= 0.90
+        inc_counter = 0
+    #@@ optional strategy to improve training - end
+
+    MSE = temp_MSE
+    print("MSE:\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
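The learning-rate heuristic at the bottom of the epoch loop, isolated into a hypothetical helper for readability (names invented; the repo keeps this logic inline):

    def adjust_learning_rate(lr, mse_increased, counters):
        # counters = {"worse": 0, "better": 0}, carried across epochs
        if mse_increased:
            counters["worse"] += 1
        else:
            counters["better"] += 1
        if counters["worse"] > 2:    # every 3rd epoch where MSE got worse: cut the rate hard
            lr *= 0.1
            counters["worse"] = 0
        if counters["better"] > 4:   # every 5th epoch where it did not: grow it by ~11%
            lr /= 0.90
            counters["better"] = 0
        return lr

Note the epochs counted need not be consecutive: each counter resets only when its own threshold fires, not when the opposite event occurs.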
7 start.sh
@@ -1,4 +1,5 @@
-xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
+xzcat dev-0/in.tsv.xz | python3 predict_dev0_testA.py > dev-0/out.tsv
 
-xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
-geval -t dev-0 >>scores.txt
+xzcat test-A/in.tsv.xz | python3 predict_dev0_testA.py > test-A/out.tsv
+
+geval -t dev-0
10304 test-A/out.tsv
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
579158 train/expected.tsv
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*' | egrep -o '[[:alpha:]]+' | sort > sortedP
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*' | egrep -o '[[:alpha:]]+' | sort > sortedS
-comm -23 sortedP sortedS > PsetsubtractionS
-cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt