diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb index 5707f0d..9d562da 100644 --- a/lab/lab_03.ipynb +++ b/lab/lab_03.ipynb @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 55, "id": "loving-prince", "metadata": {}, "outputs": [], @@ -100,6 +100,14 @@ "text += \" Create a program that uses Swing components. Compile the program. Run the program.\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "05436dad", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "extreme-cycling", @@ -110,12 +118,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 56, "id": "bound-auction", "metadata": {}, "outputs": [], "source": [ - "dictionary = ['program', 'application', 'applet' 'compile']" + "dictionary = ['program', 'application', 'applet', 'compile']" ] }, { @@ -128,13 +136,41 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "id": "cognitive-cedar", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'program': [(468, 475), (516, 523), (533, 540)],\n", + " 'application': [(80, 91), (164, 175)],\n", + " 'compile': [(56, 63), (504, 511)]}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def terminology_lookup():\n", - " return []" + "import re\n", + "\n", + "def terminology_lookup(dictionary, text):\n", + " termValues = dict()\n", + " for element in dictionary:\n", + " values = []\n", + " pattern = re.compile(r'\\b{}\\b'.format(re.escape(element)))\n", + " for match in pattern.finditer(text.lower()):\n", + " values.append((match.start(), match.end()))\n", + " \n", + " if len(values) != 0:\n", + " termValues[element] = values\n", + " \n", + " return termValues\n", + "\n", + "terminology_lookup(dictionary, text)\n", + "\n" ] }, { @@ -161,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "id": "tribal-attention", "metadata": {}, "outputs": [ @@ -205,7 +241,7 @@ "IDE\n", ",\n", "see\n", - "Running\n", + "run\n", "Tutorial\n", "Examples\n", "in\n", @@ -218,7 +254,7 @@ "work\n", "for\n", "all\n", - "swing\n", + "Swing\n", "program\n", "—\n", "applet\n", @@ -232,7 +268,7 @@ "be\n", "the\n", "step\n", - "-PRON-\n", + "you\n", "need\n", "to\n", "follow\n", @@ -248,7 +284,7 @@ "platform\n", ",\n", "if\n", - "-PRON-\n", + "you\n", "have\n", "not\n", "already\n", @@ -260,7 +296,7 @@ "program\n", "that\n", "use\n", - "Swing\n", + "swing\n", "component\n", ".\n", "compile\n", @@ -302,13 +338,48 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 43, "id": "surgical-demonstration", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'program': [(291, 299), (468, 475), (516, 523), (533, 540)],\n", + " 'application': [(80, 91), (164, 175), (322, 334)],\n", + " 'applet': [(302, 309)],\n", + " 'compile': [(56, 63), (134, 143), (504, 511)]}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def terminology_lookup():\n", - " return []" + "def terminology_lookup(dictionary, text):\n", + " termValues = dict()\n", + " lowerText = text.lower()\n", + " nlp = spacy.load(\"en_core_web_sm\")\n", + "\n", + " splitText = nlp(lowerText)\n", + " for findingWord in dictionary:\n", + " values = []\n", + " startFromIndex = 0\n", + "\n", + " for word in splitText:\n", + " if word.lemma_ == findingWord:\n", + " textBegining = lowerText.index(word.text,startFromIndex)\n", + " textEnding = textBegining + len(word)\n", + " startFromIndex = textEnding\n", + " values.append((textBegining,textEnding))\n", + " \n", + " if len(values) != 0:\n", + " termValues[findingWord] = values\n", + " \n", + " return termValues\n", + "\n", + "terminology_lookup(dictionary, text)" ] }, { @@ -337,13 +408,31 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 54, "id": "superb-butterfly", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import spacy\n", + "\n", "def get_nouns(text):\n", - " return []" + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n", + " return set(nouns)\n", + "\n", + "get_nouns(text)" ] }, { @@ -374,13 +463,66 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 71, "id": "eight-redhead", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'line': 1,\n", + " 'release': 1,\n", + " 'compilation': 1,\n", + " 'component': 1,\n", + " 'section': 1,\n", + " 'information': 1,\n", + " 'program': 4,\n", + " 'command': 1,\n", + " 'platform': 1,\n", + " 'applet': 1,\n", + " 'application': 3,\n", + " 'swing': 4,\n", + " 'instruction': 1,\n", + " 'step': 1,\n", + " 'programmer': 1}" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import spacy\n", + "\n", + "def get_nouns(text):\n", + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " nouns = [token.lemma_ for token in doc if token.pos_ == \"NOUN\"]\n", + " return set(nouns)\n", + "\n", + "def getElementsNumbers(dictionary, text):\n", + " termValues = dict()\n", + " lowerText = text.lower()\n", + " nlp = spacy.load(\"en_core_web_sm\")\n", + "\n", + " splitText = nlp(lowerText)\n", + " for findingWord in dictionary:\n", + " elementNumber = 0\n", + "\n", + " for word in splitText:\n", + " if word.lemma_ == findingWord:\n", + " elementNumber = elementNumber +1\n", + " \n", + " if elementNumber != 0:\n", + " termValues[findingWord] = elementNumber\n", + " \n", + " return termValues\n", + "\n", "def extract_terms(text):\n", - " return []" + " return getElementsNumbers(get_nouns(text), text)\n", + "\n", + "extract_terms(text)" ] }, { @@ -393,13 +535,75 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 86, "id": "monetary-mambo", "metadata": {}, "outputs": [], "source": [ - "def extract_terms(text):\n", - " return []" + "def get_dictonery_by_type(text, type):\n", + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " nouns = [token.lemma_ for token in doc if token.pos_ == type]\n", + " return set(nouns)\n", + "\n", + "\n", + "def extract_terms(text, type):\n", + " return getElementsNumbers(get_dictonery_by_type(text, type), text)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8f7eeb73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'compile': 3,\n", + " 'work': 1,\n", + " 'install': 1,\n", + " 'create': 1,\n", + " 'explain': 1,\n", + " 'run': 4,\n", + " 'see': 1,\n", + " 'need': 1,\n", + " 'do': 1,\n", + " 'follow': 1,\n", + " 'use': 2}" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_terms(text, 'VERB')" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "71c14cab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'late': 1}" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_terms(text, 'ADJ')" ] } ], @@ -422,7 +626,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.7" }, "subtitle": "3. Terminologia", "title": "Komputerowe wspomaganie tłumaczenia",