result and scripts

parent 85a53fe007
commit 28e592072c

eval.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# Compare the predictions in ./result.txt against ./train/expected.tsv
# and print the fraction of lines that match (simple accuracy).
import csv


def flatten(t):
    return [item for sublist in t for item in sublist]


with open('./result.txt', mode='r', encoding='utf-8') as result:
    with open('./train/expected.tsv', 'r', encoding='utf-8') as expected:

        result_vector = []
        expected_vector = []

        tsv_result = csv.reader(result, delimiter="\t")
        for line in tsv_result:
            result_vector.append(line)

        tsv_expected = csv.reader(expected, delimiter="\t")
        for line in tsv_expected:
            expected_vector.append(line)

        expected_vector = flatten(expected_vector)
        result_vector = flatten(result_vector)

        resultZip = zip(result_vector, expected_vector)

        matchNumber = 0.0
        for x, y in resultZip:
            if x == y:
                matchNumber += 1.0

        print(matchNumber / len(expected_vector))
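For intuition, a minimal sketch of the comparison eval.py performs, on made-up label vectors (csv.reader yields strings, so labels are compared as strings):

    result_vector = ['1', '0', '1', '1']
    expected_vector = ['1', '0', '0', '1']
    # three of the four positions agree, so the printed score is 3/4
    matches = sum(1.0 for x, y in zip(result_vector, expected_vector) if x == y)
    print(matches / len(expected_vector))  # 0.75

Note that zip stops at the shorter of the two vectors, so a result.txt with fewer lines than expected.tsv quietly lowers the reported score.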
run.py (new file, 132 lines)
@@ -0,0 +1,132 @@
# Label each input line as 1 or 0 by counting keyword matches against two
# hand-picked word lists, compared on their first six characters.
# from multiprocessing.reduction import steal_handle  # unused and Windows-only; disabled
import os
import numpy as np
import nltk.data
# nltk.download('punkt')

male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    "www.youtube.com",
    "www.sfd.pl",
    "www.wykop.pl",
    "www.kfd.pl",
    "www.elektroda.pl",
    "www.autocentrum.pl",
    "www.dobreprogramy.pl",
    "flaker.pl",
    "www.myapple.pl",
    "youtube",
    "sfd",
    "kfd",
    "elektroda",
    "autocentrum",
    "dobreprogramy",
]


female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    "www.babyboom.pl",
    "tematy.abcciaza.pl",
    "gwiazdunie",
    "photoblog",
    "kotek",
    "babyboom",
    "<3"
]


def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each word.
    result = []
    for word in words:
        result.append(word[:6])
    return result
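# Illustration (hypothetical call, not part of the committed script) of what
# the six-character truncation does, using words from the lists above:
#     wordsStem(['antykoncepcyjne', 'windows', 'hd'])
#     # -> ['antyko', 'window', 'hd']
# Input words are later compared as word.lower()[:6], so inflected forms
# such as 'antykoncepcyjnych' still hit the 'antyko' stem.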

#def sentenceLenClassificator(text):
#    threshold = 6
#    sentences = tokenizer.tokenize(text)
#    avgSentLen = 0.0
#    for sentence in sentences:
#        avgSentLen += len(sentence)
#    sentences_len = len(sentences)
#    if sentences_len == 0: return 1
#    avgSentLen = avgSentLen/sentences_len
#    if avgSentLen > threshold: return 0
#    else: return 1


def imbalanceWordsClassificator(text):
    # Count how many words start with a male-list stem versus a female-list
    # stem and return 1 or 0 accordingly (ties go to 1).
    splitSentence = text.split()
    countMale = 0
    countFemale = 0
    for word in splitSentence:
        # if word.lower() in male_words1: result + biasVal
        # elif word.lower() in female_words1: result - biasVal
        if word.lower()[:6] in male_words1: countMale += 1
        elif word.lower()[:6] in female_words1: countFemale += 1
    # normalize result
    # if result < 0.5: return 0
    # else: return 1
    if countMale >= countFemale: return 1
    elif countMale < countFemale: return 0
    # else: return sentenceLenClassificator(text)
    # len classificator
    # return sentenceLenClassificator(text)
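# Worked example of the counting logic (hypothetical sentences; assumes
# male_words1/female_words1 have already been replaced by their stems,
# which happens just below):
#     imbalanceWordsClassificator("kupiłem nowy iphone z systemem windows")
#     # 'iphone', 'system', 'window' match male stems -> 3 vs 0 -> returns 1
#     imbalanceWordsClassificator("jestem w ciąży i biorę tabletki antykoncepcyjne")
#     # 'ciąży', 'tablet', 'antyko' match female stems -> 0 vs 3 -> returns 0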

male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

result = []

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'

with open(input_path, 'r', encoding="utf8") as trainFile:
    Lines = trainFile.readlines()

# Classify each input line and collect one label per line.
for line in Lines:
    result.append(imbalanceWordsClassificator(line))
    # result.append(sentenceLenClassificator(line))

with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in result))

# os.system("eval.py")
print("-------end-------")
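A hedged usage sketch: with input_path and output_path switched back to the commented-out train paths ('./train/in.tsv' and './result.txt'), run.py produces the file that eval.py scores against ./train/expected.tsv. Assuming a 'python' interpreter is on PATH, the two steps can be chained like this:

    import subprocess
    subprocess.run(["python", "run.py"], check=True)   # writes ./result.txt (train paths active)
    subprocess.run(["python", "eval.py"], check=True)  # prints the fraction of matching labels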
test-A/out.tsv (new file, 134618 lines)
File diff suppressed because it is too large.