dumb wordlist lookup
This commit is contained in:
parent
b775a221e6
commit
f8596ec169
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,50 @@
|
|||
import re
|
||||
|
||||
import csv
|
||||
import random
|
||||
import sys
|
||||
|
||||
# Command-line arguments:
#   argv[1] - path of the input file to classify (required)
#   argv[2] - path of the expected-labels file (optional)
file1 = sys.argv[1]
# file2 is only needed by the disabled accuracy check at the end of the
# script, so a missing second argument must not abort the run.
file2 = sys.argv[2] if len(sys.argv) > 2 else None
|
||||
|
||||
|
||||
def clean(text):
    """Normalise *text* to lowercase letters separated by single spaces.

    The text is lowercased, every character that is neither whitespace nor
    one of the letters a-z / ąćęłńóśżź is removed, and each run of whitespace
    is collapsed to one space. Leading and trailing whitespace is dropped.

    Tabs and newlines are treated as word separators rather than deleted, so
    words on either side of a tab are not glued into a single token.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; empty if *text* contains no letters.
    """
    text = text.lower()
    # After lower() only lowercase letters can occur, so the character class
    # does not need the uppercase alphabet.
    text = re.sub(r'[^aąbcćdeęfghijklłmnńoópqrsśtuvwxyzżź\s]+', '', text)
    # Collapse whitespace *after* stripping: removing e.g. " - " would
    # otherwise leave two adjacent spaces behind.
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
# Keywords whose presence in a text yields label 1 in the lookup below.
# NOTE: a comma was missing after 'piłka', which silently concatenated it
# with 'piłkę' into the single entry 'piłkapiłkę'.
male_words = [
    'silnik', 'windows', 'silnika', 'gb', 'mb', 'meczu', 'pc', 'opony',
    'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'systemu', 'serwer',
    'mecz', 'procesor', 'system', 'żona', 'żony', 'żonę', 'piwo', 'piwa',
    'piłka', 'piłkę', 'samochód', 'samochodu', 'samochodem', 'rower',
    'roweru', 'rowerem', 'gra', 'gry', 'grę', 'grą', 'grami',
]

# Keywords whose presence in a text yields label 0 in the lookup below.
# The duplicated 'ciąży' entry has been removed.
female_words = [
    'ciąży', 'miesiączki', 'ciasto', 'ciążę', 'ciąża', 'antykoncepcyjne',
    'ginekologa', 'tabletki', 'porodzie', 'mąż', 'męża', 'miesiączkę',
    'krwawienie', 'ciasta', 'makijaż', 'makijażu', 'makijażem', 'fryzura',
    'fryzurę', 'fryzury', 'dieta', 'dietę', 'diety', 'miesiączka',
    'dziecko', 'dziecka', 'dziecku', 'przystojny', 'przystojnego',
    'przystojnemu',
]
|
||||
|
||||
# Labels assigned so far, one per input line, in input order.
predict = []

# Build the membership sets once so each token lookup is O(1) instead of a
# linear scan of the keyword lists.
female_vocab = frozenset(female_words)
male_vocab = frozenset(male_words)

# Classify every line of file1 by keyword lookup and write one label per line
# to out.tsv: 0 if any female keyword occurs, otherwise 1 if any male keyword
# occurs, otherwise a random 0/1 guess.
with open(file1, encoding='utf-8') as input_file, open('out.tsv', 'w', encoding='utf-8') as output_file:
    # The raw lines are iterated directly (the csv.reader that used to be
    # built here was never consumed). NUL bytes need no special handling:
    # clean() removes them together with every other non-letter character.
    # Iterating a text file never yields an empty string, so no emptiness
    # check is needed and exactly one label is written per input line.
    for row in input_file:
        tokens = clean(row).split()
        if any(word in female_vocab for word in tokens):
            label = 0
        elif any(word in male_vocab for word in tokens):
            label = 1
        else:
            # Draw once so the value written to the file and the value
            # recorded in `predict` are always the same.
            label = round(random.random())
        output_file.write(str(label) + '\n')
        predict.append(label)
|
||||
|
||||
# Disabled accuracy check, kept inside a bare string literal so it never runs.
# When enabled it reads one integer label per line from file2 and prints the
# fraction of entries in `predict` that match.
'''
correct = []

with open(file2, encoding='utf-8') as exp_file:
    for row in exp_file:
        correct.append(int(row.strip()))

print(sum((1 if i == j else 0 for i, j in zip(predict, correct))) / len(correct))
'''
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue