133 lines
2.9 KiB
Python
133 lines
2.9 KiB
Python
from multiprocessing.reduction import steal_handle
|
|
import os
|
|
import numpy as np
|
|
import nltk.data
|
|
# nltk.download('punkt')
|
|
|
|
# Tokens whose presence counts towards the "male" tally in
# imbalanceWordsClassificator.  Later in this file the list is rebound to the
# 6-character stems produced by wordsStem, so only the first six characters of
# each entry take part in matching.
male_words1 = [
    # Plain keywords.
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    # Host names.
    "www.youtube.com",
    "www.sfd.pl",
    "www.wykop.pl",
    "www.kfd.pl",
    "www.elektroda.pl",
    "www.autocentrum.pl",
    "www.dobreprogramy.pl",
    "flaker.pl",
    "www.myapple.pl",
    # Bare site names.
    "youtube",
    "sfd",
    "kfd",
    "elektroda",
    "autocentrum",
    "dobreprogramy",
]
|
|
|
|
|
|
|
|
# Tokens whose presence counts towards the "female" tally in
# imbalanceWordsClassificator.  Later in this file the list is rebound to the
# 6-character stems produced by wordsStem, so only the first six characters of
# each entry take part in matching.
female_words1 = [
    # Plain keywords.
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    # Host names.
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    "www.babyboom.pl",
    "tematy.abcciaza.pl",
    # Bare site names.
    "gwiazdunie",
    "photoblog",
    "kotek",
    "babyboom",
    # Emoticon.
    "<3",
]
|
|
|
|
def wordsStem(words):
    """Return a new list holding the first six characters of each word.

    Words shorter than six characters are kept whole.  The input iterable is
    not modified and the output preserves its order (duplicates included).
    """
    return [word[:6] for word in words]
|
|
|
|
# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen/sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1
|
|
|
|
def imbalanceWordsClassificator(text, male_stems=None, female_stems=None):
    """Classify ``text`` by comparing how many tokens hit each stem collection.

    The text is split on whitespace; each token is lower-cased and cut to its
    first six characters before lookup.  A token found in ``male_stems`` adds
    one to the male tally; otherwise, if it is found in ``female_stems``, it
    adds one to the female tally.

    Args:
        text: The string to classify.
        male_stems: Container of stems counted as "male".  Defaults to the
            module-level ``male_words1`` as bound at call time.
        female_stems: Container of stems counted as "female".  Defaults to
            the module-level ``female_words1`` as bound at call time.

    Returns:
        1 when the male tally is greater than or equal to the female tally
        (this includes ties and texts with no hits at all), otherwise 0.
    """
    # Resolve the defaults lazily: the module rebinds both globals to their
    # stemmed versions after this function is defined.
    if male_stems is None:
        male_stems = male_words1
    if female_stems is None:
        female_stems = female_words1

    male_count = 0
    female_count = 0
    for token in text.split():
        stem = token.lower()[:6]
        # The male collection is checked first, so a stem present in both
        # collections is counted as male only.
        if stem in male_stems:
            male_count += 1
        elif stem in female_stems:
            female_count += 1

    return 1 if male_count >= female_count else 0
|
|
|
|
# Rebind both vocabularies to their 6-character stems so they line up with the
# truncated, lower-cased tokens that imbalanceWordsClassificator looks up.
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'

# Read every input line up front, then classify each one in input order.
with open(input_path, 'r', encoding="utf8") as trainFile:
    Lines = trainFile.readlines()

result = [imbalanceWordsClassificator(line) for line in Lines]
# result = [sentenceLenClassificator(line) for line in Lines]

# One prediction per line; no newline is written after the last prediction.
with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in result))

# os.system("eval.py")
print("-------end-------")
|