forked from kubapok/en-ner-conll-2003
Transformer implementation
This commit is contained in:
parent
1397a7a5c2
commit
89030b0440
66
Transformer.ipynb
Normal file
66
Transformer.ipynb
Normal file
@ -0,0 +1,66 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Transformer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Import bibliotek"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Wczytanie danych"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_data = pd.read_csv(\"train/train.tsv\", sep=\"\\t\", header=None, names=[\"y\", \"x\"])\n",
|
||||
"test_A_data = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", header=None, names=[\"x\"])\n",
|
||||
"dev0_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, names=[\"x\"])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
58
skrypcik.py
Normal file
58
skrypcik.py
Normal file
@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def correct_labels(input_file: str, output_file: str) -> None:
    """Rewrite an NER prediction file so entity labels follow the IOB2 scheme.

    Each line of *input_file* is a space-separated sequence of IOB labels
    (e.g. ``O I-ORG I-ORG O``).  A label ``I-X`` that is not preceded by
    ``B-X`` or ``I-X`` actually starts a new entity, so it is rewritten to
    ``B-X``.  All other tokens pass through unchanged.

    Parameters
    ----------
    input_file:
        Path to a TSV file whose single column holds the label sequences.
    output_file:
        Path the corrected sequences are written to (no header, no index);
        may be the same path as *input_file* for an in-place fix.
    """
    # Only the four CoNLL-2003 entity types are corrected; any other I-*
    # label is deliberately left untouched (matches the original behaviour).
    entity_types = ("ORG", "PER", "LOC", "MISC")

    df = pd.read_csv(input_file, sep="\t", names=["Text"])

    corrected_lines = []
    for line in df["Text"]:
        corrected_tokens = []
        previous_token = "O"  # sentinel: sequence start behaves like "outside"
        for token in line.split(" "):
            corrected = token
            if token.startswith("I-") and token[2:] in entity_types:
                etype = token[2:]
                # An I-X not continuing a B-X/I-X run begins a new entity.
                if previous_token not in (f"B-{etype}", f"I-{etype}"):
                    corrected = f"B-{etype}"
            corrected_tokens.append(corrected)
            previous_token = token

        corrected_lines.append(" ".join(corrected_tokens))

    df["Text"] = corrected_lines
    df.to_csv(output_file, sep="\t", index=False, header=False)
|
||||
|
||||
|
||||
# Apply the IOB label correction in place to both prediction files:
# each file is read, corrected, and written back to the same path.
for prediction_path in ("test-A/out.tsv", "dev-0/out.tsv"):
    correct_labels(prediction_path, prediction_path)
|
945
train/train.tsv
Normal file
945
train/train.tsv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user