petite-difference-challenge2/run.py

import os
# nltk is only used by the (currently disabled) sentence-length classifier below.
import nltk.data
# nltk.download('punkt')
# Keywords and domains used as male indicators (Polish tech, automotive,
# and sports vocabulary).
male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    'www.youtube.com',
    'www.sfd.pl',
    'www.wykop.pl',
    'www.kfd.pl',
    'www.elektroda.pl',
    'www.autocentrum.pl',
    'www.dobreprogramy.pl',
    'flaker.pl',
    'www.myapple.pl',
    'youtube',
    'sfd',
    'kfd',
    'elektroda',
    'autocentrum',
    'dobreprogramy',
]
# Keywords and domains used as female indicators (Polish pregnancy, health,
# and parenting vocabulary).
female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    'www.babyboom.pl',
    'tematy.abcciaza.pl',
    'gwiazdunie',
    'photoblog',
    'kotek',
    'babyboom',
    '<3',
]
def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each word,
    # which collapses Polish inflected forms (e.g. miesiączki/miesiączkę
    # both become 'miesią').
    return [word[:6] for word in words]
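# Illustration: wordsStem(['antykoncepcyjne', 'pc']) returns ['antyko', 'pc'];
# words shorter than six characters pass through unchanged.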
# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen / sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1
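# To re-enable the classifier above, a tokenizer has to be loaded first.
# A sketch, assuming the punkt data is installed; the original loaded the
# English pickle, but punkt also ships a Polish model:
# tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')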
def imbalanceWordsClassificator(text):
    # Count male- and female-list stems in the line. The keyword lists are
    # stemmed below, so comparing against word.lower()[:6] is consistent.
    countMale = 0
    countFemale = 0
    for word in text.split():
        stem = word.lower()[:6]
        if stem in male_words1: countMale += 1
        elif stem in female_words1: countFemale += 1
    # An earlier variant added/subtracted a biasVal per hit and thresholded
    # the normalized score at 0.5; a sentence-length fallback was also tried:
    # return sentenceLenClassificator(text)
    # Ties (including lines with no keyword hits) default to class 1.
    if countMale >= countFemale: return 1
    return 0
# Stem both keyword lists once so membership tests match word.lower()[:6].
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
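# Hypothetical sanity check (the sentence is illustrative, not from the
# data set):
# imbalanceWordsClassificator('mój mąż kupił nowe opony')
# 'mąż' hits the female list and 'opony' the male list, so the counts tie
# and the classifier returns 1.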
result = []
# Paths for a local run against the training data:
# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'
with open(input_path, 'r', encoding='utf8') as trainFile:
    for line in trainFile:
        result.append(imbalanceWordsClassificator(line))
        # result.append(sentenceLenClassificator(line))

with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(label) for label in result))
# os.system("eval.py")
print("-------end-------")
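# Optional self-check -- a minimal sketch, not the challenge's eval.py.
# It assumes a gold-label file with one 0/1 label per line; the path below
# is a placeholder, adjust it to wherever your expected labels live.
# def accuracy(pred_path, gold_path):
#     with open(pred_path, encoding='utf-8') as p, open(gold_path, encoding='utf-8') as g:
#         pairs = [(a.strip(), b.strip()) for a, b in zip(p, g)]
#     return sum(a == b for a, b in pairs) / len(pairs)
# print(accuracy(output_path, './dev-0/expected.tsv'))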