result and scripts
This commit is contained in:
parent 85a53fe007
commit 28e592072c
33	eval.py	Normal file
@@ -0,0 +1,33 @@
import csv


def flatten(t):
    # Flatten a list of lists into a single flat list.
    return [item for sublist in t for item in sublist]


with open('./result.txt', mode='r', encoding='utf-8') as result:
    with open('./train/expected.tsv', 'r', encoding='utf-8') as expected:

        result_vector = []
        expected_vector = []

        # Read the predicted labels produced by run.py.
        tsv_result = csv.reader(result, delimiter="\t")
        for line in tsv_result:
            result_vector.append(line)

        # Read the gold labels.
        tsv_expected = csv.reader(expected, delimiter="\t")
        for line in tsv_expected:
            expected_vector.append(line)

        # csv.reader yields one list per row, so flatten both into plain label lists.
        expected_vector = flatten(expected_vector)
        result_vector = flatten(result_vector)

        resultZip = zip(result_vector, expected_vector)

        # Accuracy: fraction of positions where the prediction equals the gold label.
        matchNumber = 0.0
        for x, y in resultZip:
            if x == y:
                matchNumber += 1.0

        print(matchNumber / len(expected_vector))
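For comparison, the same accuracy check can be written without the csv module by reading one label per line; this is only a sketch, assuming the same file layout and paths as eval.py above:

# Sketch only: assumes one label per line in both files, paths as in eval.py.
with open('./result.txt', encoding='utf-8') as f:
    predicted = [line.strip() for line in f]
with open('./train/expected.tsv', encoding='utf-8') as f:
    gold = [line.strip() for line in f]

matches = sum(1 for p, g in zip(predicted, gold) if p == g)
print(matches / len(gold))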
132	run.py	Normal file
@@ -0,0 +1,132 @@
import os
import nltk.data
# nltk.download('punkt')

# Polish keywords and site domains that typically indicate a male author.
male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    "www.youtube.com",
    "www.sfd.pl",
    "www.wykop.pl",
    "www.kfd.pl",
    "www.elektroda.pl",
    "www.autocentrum.pl",
    "www.dobreprogramy.pl",
    "flaker.pl",
    "www.myapple.pl",
    "youtube",
    "sfd",
    "kfd",
    "elektroda",
    "autocentrum",
    "dobreprogramy",
]

# Polish keywords and site domains that typically indicate a female author.
female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    "www.babyboom.pl",
    "tematy.abcciaza.pl",
    "gwiazdunie",
    "photoblog",
    "kotek",
    "babyboom",
    "<3"
]


def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each keyword.
    result = []
    for word in words:
        result.append(word[:6])
    return result


# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen / sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1


def imbalanceWordsClassificator(text):
    # Count stemmed keyword hits for each class and return the majority class:
    # 1 for male, 0 for female; ties default to 1.
    splitSentence = text.split()
    countMale = 0
    countFemale = 0
    for word in splitSentence:
        if word.lower()[:6] in male_words1:
            countMale += 1
        elif word.lower()[:6] in female_words1:
            countFemale += 1
    if countMale >= countFemale:
        return 1
    else:
        return 0
    # else: return sentenceLenClassificator(text)


# Stem the keyword lists once, so lookups compare stem against stem.
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

result = []

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'

with open(input_path, 'r', encoding='utf-8') as trainFile:
    Lines = trainFile.readlines()

# Classify every input line.
for line in Lines:
    result.append(imbalanceWordsClassificator(line))
    # result.append(sentenceLenClassificator(line))

# Write one predicted label per line.
with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in result))

# os.system("eval.py")
print("-------end-------")
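As a quick sanity check, the classifier can be called on a single string once the keyword lists have been stemmed; this is only a sketch, using a made-up Polish sentence that is not taken from the data set:

# Sketch only: hypothetical input, not from the corpus.
sample = "mam problem z systemem windows na moim pc"
print(imbalanceWordsClassificator(sample))  # prints 1: 'system', 'window' and 'pc' hit the male list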
134618	test-A/out.tsv	Normal file
File diff suppressed because it is too large