added lab3

2024-04-15 22:09:49 +02:00 · 2024-04-15 22:09:49 +02:00 · 854b3629df
commit 854b3629df
parent e343070e32
1 changed files with 231 additions and 27 deletions
--- a/lab/lab_03.ipynb
+++ b/lab/lab_03.ipynb
@ -86,7 +86,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 55,
   "id": "loving-prince",
   "metadata": {},
   "outputs": [],
@ -100,6 +100,14 @@
    "text += \" Create a program that uses Swing components. Compile the program. Run the program.\""
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05436dad",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "markdown",
   "id": "extreme-cycling",
@ -110,12 +118,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 56,
   "id": "bound-auction",
   "metadata": {},
   "outputs": [],
   "source": [
-    "dictionary = ['program', 'application', 'applet' 'compile']"
+    "dictionary = ['program', 'application', 'applet', 'compile']"
   ]
  },
  {
@ -128,13 +136,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
   "id": "cognitive-cedar",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'program': [(468, 475), (516, 523), (533, 540)],\n",
+       " 'application': [(80, 91), (164, 175)],\n",
+       " 'compile': [(56, 63), (504, 511)]}"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "def terminology_lookup():\n",
-    "    return []"
+    "import re\n",
+    "\n",
+    "def terminology_lookup(dictionary, text):\n",
+    "    \"\"\"Locate whole-word, case-insensitive occurrences of dictionary terms.\n",
+    "\n",
+    "    Only exact word forms are matched (each term is wrapped in regex word\n",
+    "    boundaries), so an inflected form such as 'programs' is not reported\n",
+    "    for the term 'program'.\n",
+    "\n",
+    "    Args:\n",
+    "        dictionary: iterable of terms to search for.\n",
+    "        text: the string to search.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict mapping every term found at least once to a list of\n",
+    "        (start, end) character offsets into text.\n",
+    "    \"\"\"\n",
+    "    term_spans = {}\n",
+    "    for term in dictionary:\n",
+    "        # Match case-insensitively on the original string rather than on\n",
+    "        # text.lower(): lower-casing can change the length of some Unicode\n",
+    "        # text, which would shift offsets, and it was redone for every term.\n",
+    "        pattern = re.compile(r'\\b{}\\b'.format(re.escape(term)), re.IGNORECASE)\n",
+    "        spans = [match.span() for match in pattern.finditer(text)]\n",
+    "        if spans:\n",
+    "            term_spans[term] = spans\n",
+    "    return term_spans\n",
+    "\n",
+    "terminology_lookup(dictionary, text)\n",
+    "\n"
   ]
  },
  {
@ -161,7 +197,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
   "id": "tribal-attention",
   "metadata": {},
   "outputs": [
@ -205,7 +241,7 @@
      "IDE\n",
      ",\n",
      "see\n",
-      "Running\n",
+      "run\n",
      "Tutorial\n",
      "Examples\n",
      "in\n",
@ -218,7 +254,7 @@
      "work\n",
      "for\n",
      "all\n",
-      "swing\n",
+      "Swing\n",
      "program\n",
      "—\n",
      "applet\n",
@ -232,7 +268,7 @@
      "be\n",
      "the\n",
      "step\n",
-      "-PRON-\n",
+      "you\n",
      "need\n",
      "to\n",
      "follow\n",
@ -248,7 +284,7 @@
      "platform\n",
      ",\n",
      "if\n",
-      "-PRON-\n",
+      "you\n",
      "have\n",
      "not\n",
      "already\n",
@ -260,7 +296,7 @@
      "program\n",
      "that\n",
      "use\n",
-      "Swing\n",
+      "swing\n",
      "component\n",
      ".\n",
      "compile\n",
@ -302,13 +338,48 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
   "id": "surgical-demonstration",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'program': [(291, 299), (468, 475), (516, 523), (533, 540)],\n",
+       " 'application': [(80, 91), (164, 175), (322, 334)],\n",
+       " 'applet': [(302, 309)],\n",
+       " 'compile': [(56, 63), (134, 143), (504, 511)]}"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "def terminology_lookup():\n",
-    "    return []"
+    "def terminology_lookup(dictionary, text):\n",
+    "    \"\"\"Locate occurrences of dictionary terms in text, matching by lemma.\n",
+    "\n",
+    "    The lower-cased text is parsed with spaCy; a token counts as an\n",
+    "    occurrence of a term when its lemma equals that term, so inflected\n",
+    "    forms are found as well.\n",
+    "\n",
+    "    Args:\n",
+    "        dictionary: iterable of terms given in their base (lemma) form.\n",
+    "        text: the string to search.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict mapping every term found at least once to a list of\n",
+    "        (start, end) character offsets of the matching tokens in the\n",
+    "        lower-cased text.\n",
+    "    \"\"\"\n",
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "    doc = nlp(text.lower())\n",
+    "\n",
+    "    # Single pass over the document. token.idx is the token's own character\n",
+    "    # offset, so a span cannot be misplaced onto an earlier word that merely\n",
+    "    # contains the token text (e.g. 'program' inside 'programmer'), which a\n",
+    "    # str.index() search from a moving cursor could do.\n",
+    "    spans_by_lemma = {}\n",
+    "    for token in doc:\n",
+    "        span = (token.idx, token.idx + len(token.text))\n",
+    "        spans_by_lemma.setdefault(token.lemma_, []).append(span)\n",
+    "\n",
+    "    return {\n",
+    "        term: spans_by_lemma[term]\n",
+    "        for term in dictionary\n",
+    "        if term in spans_by_lemma\n",
+    "    }\n",
+    "\n",
+    "terminology_lookup(dictionary, text)"
   ]
  },
  {
@ -337,13 +408,31 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 54,
   "id": "superb-butterfly",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "set()"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
+    "import spacy\n",
+    "\n",
    "def get_nouns(text):\n",
-    "    return []"
+    "    \"\"\"Return the set of surface forms of all tokens spaCy tags as NOUN.\"\"\"\n",
+    "    pipeline = spacy.load(\"en_core_web_sm\")\n",
+    "    return {tok.text for tok in pipeline(text) if tok.pos_ == \"NOUN\"}\n",
+    "\n",
+    "get_nouns(text)"
   ]
  },
  {
@ -374,13 +463,66 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 71,
   "id": "eight-redhead",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'line': 1,\n",
+       " 'release': 1,\n",
+       " 'compilation': 1,\n",
+       " 'component': 1,\n",
+       " 'section': 1,\n",
+       " 'information': 1,\n",
+       " 'program': 4,\n",
+       " 'command': 1,\n",
+       " 'platform': 1,\n",
+       " 'applet': 1,\n",
+       " 'application': 3,\n",
+       " 'swing': 4,\n",
+       " 'instruction': 1,\n",
+       " 'step': 1,\n",
+       " 'programmer': 1}"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
+    "import spacy\n",
+    "\n",
+    "def get_nouns(text):\n",
+    "    \"\"\"Return the set of lemmas of all tokens spaCy tags as NOUN.\"\"\"\n",
+    "    pipeline = spacy.load(\"en_core_web_sm\")\n",
+    "    return {tok.lemma_ for tok in pipeline(text) if tok.pos_ == \"NOUN\"}\n",
+    "\n",
+    "\n",
+    "def getElementsNumbers(dictionary, text):\n",
+    "    \"\"\"Count how often each term occurs in text, matching by lemma.\n",
+    "\n",
+    "    Args:\n",
+    "        dictionary: iterable of terms given in their base (lemma) form.\n",
+    "        text: the string to analyse; it is lower-cased before parsing.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict mapping every term that occurs at least once to the number\n",
+    "        of tokens whose lemma equals it.\n",
+    "    \"\"\"\n",
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "    # Tally every lemma in one pass instead of rescanning the whole\n",
+    "    # document once per term.\n",
+    "    lemma_counts = {}\n",
+    "    for token in nlp(text.lower()):\n",
+    "        lemma_counts[token.lemma_] = lemma_counts.get(token.lemma_, 0) + 1\n",
+    "\n",
+    "    return {\n",
+    "        term: lemma_counts[term]\n",
+    "        for term in dictionary\n",
+    "        if term in lemma_counts\n",
+    "    }\n",
+    "\n",
    "def extract_terms(text):\n",
-    "    return []"
+    "    \"\"\"Return a dict mapping each noun lemma found in text to its count.\"\"\"\n",
+    "    return getElementsNumbers(get_nouns(text), text)\n",
+    "\n",
+    "extract_terms(text)"
   ]
  },
  {
@ -393,13 +535,75 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 86,
   "id": "monetary-mambo",
   "metadata": {},
   "outputs": [],
   "source": [
-    "def extract_terms(text):\n",
-    "    return []"
+    "def get_dictonery_by_type(text, type):\n",
+    "    \"\"\"Return the set of lemmas of all tokens whose spaCy pos_ equals type.\"\"\"\n",
+    "    pipeline = spacy.load(\"en_core_web_sm\")\n",
+    "    return {tok.lemma_ for tok in pipeline(text) if tok.pos_ == type}\n",
+    "\n",
+    "\n",
+    "\n",
+    "def extract_terms(text, type='NOUN'):\n",
+    "    \"\"\"Count the lemmas of the words in text that have the given part of speech.\n",
+    "\n",
+    "    Args:\n",
+    "        text: the string to analyse.\n",
+    "        type: value compared against each token's spaCy pos_ (the notebook\n",
+    "            calls this with 'VERB' and 'ADJ'). Defaults to 'NOUN' so that\n",
+    "            single-argument calls written for the earlier\n",
+    "            extract_terms(text) keep working.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict mapping each selected lemma to its count, as produced by\n",
+    "        getElementsNumbers.\n",
+    "    \"\"\"\n",
+    "    return getElementsNumbers(get_dictonery_by_type(text, type), text)\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "8f7eeb73",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'compile': 3,\n",
+       " 'work': 1,\n",
+       " 'install': 1,\n",
+       " 'create': 1,\n",
+       " 'explain': 1,\n",
+       " 'run': 4,\n",
+       " 'see': 1,\n",
+       " 'need': 1,\n",
+       " 'do': 1,\n",
+       " 'follow': 1,\n",
+       " 'use': 2}"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extract_terms(text, 'VERB')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "id": "71c14cab",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'late': 1}"
+      ]
+     },
+     "execution_count": 93,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extract_terms(text, 'ADJ')"
   ]
  }
 ],
@ -422,7 +626,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.7"
  },
  "subtitle": "3. Terminologia",
  "title": "Komputerowe wspomaganie tłumaczenia",