3
This commit is contained in:
parent
a2064b7ed9
commit
2be2a96fe1
19454
dev-0/out.tsv
19454
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
23
run.py
23
run.py
@@ -12,7 +12,6 @@ data = pd.read_csv(
|
|||||||
error_bad_lines=False,
|
error_bad_lines=False,
|
||||||
header=None,
|
header=None,
|
||||||
quoting=csv.QUOTE_NONE,
|
quoting=csv.QUOTE_NONE,
|
||||||
nrows=250000
|
|
||||||
)
|
)
|
||||||
train_labels = pd.read_csv(
|
train_labels = pd.read_csv(
|
||||||
"train/expected.tsv",
|
"train/expected.tsv",
|
||||||
@@ -20,7 +19,6 @@ train_labels = pd.read_csv(
|
|||||||
error_bad_lines=False,
|
error_bad_lines=False,
|
||||||
header=None,
|
header=None,
|
||||||
quoting=csv.QUOTE_NONE,
|
quoting=csv.QUOTE_NONE,
|
||||||
nrows=250000
|
|
||||||
)
|
)
|
||||||
|
|
||||||
train_data = data[[6, 7]]
|
train_data = data[[6, 7]]
|
||||||
@@ -33,22 +31,17 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
|
|||||||
|
|
||||||
def clean(text):
|
def clean(text):
|
||||||
text = str(text)
|
text = str(text)
|
||||||
# normalize text
|
|
||||||
text = (
|
text = (
|
||||||
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
|
unicodedata.normalize("NFKD", text)
|
||||||
'utf-8', 'ignore'))
|
.encode("ascii", "ignore")
|
||||||
# replace html chars with ' '
|
.decode("utf-8", "ignore")
|
||||||
text = re.sub('<.*?>', ' ', text)
|
)
|
||||||
# remove punctuation
|
text = re.sub("<.*?>", " ", text)
|
||||||
text = text.translate(str.maketrans(' ', ' ', string.punctuation))
|
text = text.translate(str.maketrans(" ", " ", string.punctuation))
|
||||||
# only alphabets and numerics
|
text = re.sub("[^a-zA-Z]", " ", text)
|
||||||
text = re.sub('[^a-zA-Z]', ' ', text)
|
|
||||||
# replace newline with space
|
|
||||||
text = re.sub("\n", " ", text)
|
text = re.sub("\n", " ", text)
|
||||||
# lower case
|
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
# split and join the words
|
text = " ".join(text.split())
|
||||||
text = ' '.join(text.split())
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
13608
test-A/out.tsv
13608
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user