diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..adefade Binary files /dev/null and b/.DS_Store differ diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb index 4b3b331..f6e8fef 100644 --- a/lab/lab_03.ipynb +++ b/lab/lab_03.ipynb @@ -63,7 +63,7 @@ "id": "diverse-sunglasses", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: metal cabinet guides lub metal cabinet slides. Skorzystalem z dwoch slownikow oraz duzego modelu jezykowego." ] }, { @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 102, "id": "loving-prince", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 103, "id": "bound-auction", "metadata": {}, "outputs": [], @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 104, "id": "cognitive-cedar", "metadata": {}, "outputs": [], @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 105, "id": "7cc3ad1f", "metadata": {}, "outputs": [ @@ -162,7 +162,7 @@ " ('compile', [(56, 62)])]" ] }, - "execution_count": 34, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 106, "id": "tribal-attention", "metadata": {}, "outputs": [ @@ -203,108 +203,108 @@ "name": "stdout", "output_type": "stream", "text": [ - " \n", - "for\n", - "all\n", - "Java\n", - "programmer\n", - ":\n", - "this\n", - "section\n", - "explain\n", - "how\n", - "to\n", - "compile\n", - "and\n", - "run\n", - "a\n", - "swing\n", - "application\n", - "from\n", - "the\n", - "command\n", - "line\n", - ".\n", - "for\n", - "information\n", - "on\n", - "compile\n", - "and\n", - "run\n", - "a\n", - "swing\n", - "application\n", - "use\n", - "NetBeans\n", - "IDE\n", - ",\n", - "see\n", - "run\n", - "Tutorial\n", - "Examples\n", - "in\n", - "NetBeans\n", - "IDE\n", - ".\n", - "the\n", - "compilation\n", - "instruction\n", - "work\n", - "for\n", - "all\n", - "Swing\n", - "program\n", - "—\n", - "applet\n", - ",\n", - "as\n", - "well\n", - "as\n", - "application\n", - ".\n", - "here\n", - "be\n", - "the\n", - "step\n", - "you\n", - "need\n", - "to\n", - "follow\n", - ":\n", - "install\n", - "the\n", - "late\n", - "release\n", - "of\n", - "the\n", - "Java\n", - "SE\n", - "platform\n", - ",\n", - "if\n", - "you\n", - "have\n", - "not\n", - "already\n", - "do\n", - "so\n", - ".\n", - "create\n", - "a\n", - "program\n", - "that\n", - "use\n", - "swing\n", - "component\n", - ".\n", - "compile\n", - "the\n", - "program\n", - ".\n", - "run\n", - "the\n", - "program\n", - ".\n" + " 0\n", + "For for 1\n", + "all all 5\n", + "Java Java 9\n", + "programmers programmer 14\n", + ": : 25\n", + "This this 27\n", + "section section 32\n", + "explains explain 40\n", + "how how 49\n", + "to to 53\n", + "compile compile 56\n", + "and and 64\n", + "run run 68\n", + "a a 72\n", + "Swing swing 74\n", + "application application 80\n", + "from from 92\n", + "the the 97\n", + "command command 101\n", + "line line 109\n", + ". . 113\n", + "For for 115\n", + "information information 119\n", + "on on 131\n", + "compiling compile 134\n", + "and and 144\n", + "running run 148\n", + "a a 156\n", + "Swing swing 158\n", + "application application 164\n", + "using use 176\n", + "NetBeans NetBeans 182\n", + "IDE IDE 191\n", + ", , 194\n", + "see see 196\n", + "Running run 200\n", + "Tutorial Tutorial 208\n", + "Examples Examples 217\n", + "in in 226\n", + "NetBeans NetBeans 229\n", + "IDE IDE 238\n", + ". . 241\n", + "The the 243\n", + "compilation compilation 247\n", + "instructions instruction 259\n", + "work work 272\n", + "for for 277\n", + "all all 281\n", + "Swing Swing 285\n", + "programs program 291\n", + "— — 300\n", + "applets applet 302\n", + ", , 309\n", + "as as 311\n", + "well well 314\n", + "as as 319\n", + "applications application 322\n", + ". . 334\n", + "Here here 336\n", + "are be 341\n", + "the the 345\n", + "steps step 349\n", + "you you 355\n", + "need need 359\n", + "to to 364\n", + "follow follow 367\n", + ": : 373\n", + "Install install 375\n", + "the the 383\n", + "latest late 387\n", + "release release 394\n", + "of of 402\n", + "the the 405\n", + "Java Java 409\n", + "SE SE 414\n", + "platform platform 417\n", + ", , 425\n", + "if if 427\n", + "you you 430\n", + "have have 434\n", + "n't not 438\n", + "already already 442\n", + "done do 450\n", + "so so 455\n", + ". . 457\n", + "Create create 459\n", + "a a 466\n", + "program program 468\n", + "that that 476\n", + "uses use 481\n", + "Swing swing 486\n", + "components component 492\n", + ". . 502\n", + "Compile compile 504\n", + "the the 512\n", + "program program 516\n", + ". . 523\n", + "Run run 525\n", + "the the 529\n", + "program program 533\n", + ". . 540\n" ] } ], @@ -315,7 +315,7 @@ "doc = nlp(text)\n", "\n", "for token in doc:\n", - " print(token.lemma_)" + " print(token, token.lemma_, token.idx)" ] }, { @@ -336,13 +336,50 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 107, "id": "surgical-demonstration", "metadata": {}, "outputs": [], "source": [ - "def terminology_lookup():\n", - " return []" + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")\n", + "\n", + "\n", + "def terminology_lookup(txt, labels):\n", + " result = {};\n", + " doc = nlp(txt)\n", + "\n", + " for token in doc:\n", + " if token.lemma_ in labels: \n", + " if token.lemma_ not in result:\n", + " result[token.lemma_] = []\n", + " result[token.lemma_].append((token.idx, token.idx + len(token)))\n", + "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "4772c1b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'compile': [(56, 63), (134, 143), (504, 511)],\n", + " 'application': [(80, 91), (164, 175), (322, 334)],\n", + " 'program': [(291, 299), (468, 475), (516, 523), (533, 540)],\n", + " 'applet': [(302, 309)]}" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "terminology_lookup(text, dictionary)" ] }, { @@ -371,13 +408,56 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 109, "id": "superb-butterfly", "metadata": {}, "outputs": [], "source": [ "def get_nouns(text):\n", - " return []" + " doc = nlp(text)\n", + " return [token.lemma_ for token in doc if token.pos_ == 'NOUN']" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "3c916a3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['programmer',\n", + " 'section',\n", + " 'swing',\n", + " 'application',\n", + " 'command',\n", + " 'line',\n", + " 'information',\n", + " 'swing',\n", + " 'application',\n", + " 'compilation',\n", + " 'instruction',\n", + " 'program',\n", + " 'applet',\n", + " 'application',\n", + " 'step',\n", + " 'release',\n", + " 'platform',\n", + " 'program',\n", + " 'swing',\n", + " 'component',\n", + " 'program',\n", + " 'program']" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_nouns(text)" ] }, { @@ -390,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 111, "id": "acting-tolerance", "metadata": {}, "outputs": [], @@ -408,13 +488,57 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 112, "id": "eight-redhead", "metadata": {}, "outputs": [], "source": [ + "def count_words(words):\n", + " word_count = {}\n", + " for word in words:\n", + " if word in word_count:\n", + " word_count[word] += 1\n", + " else:\n", + " word_count[word] = 1\n", + " return word_count\n", + "\n", "def extract_terms(text):\n", - " return []" + " return count_words(get_nouns(text))" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "374550d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'programmer': 1,\n", + " 'section': 1,\n", + " 'swing': 3,\n", + " 'application': 3,\n", + " 'command': 1,\n", + " 'line': 1,\n", + " 'information': 1,\n", + " 'compilation': 1,\n", + " 'instruction': 1,\n", + " 'program': 4,\n", + " 'applet': 1,\n", + " 'step': 1,\n", + " 'release': 1,\n", + " 'platform': 1,\n", + " 'component': 1}" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_terms(text)" ] }, { @@ -427,13 +551,85 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 114, "id": "monetary-mambo", "metadata": {}, "outputs": [], "source": [ + "def get_verbs(text):\n", + " doc = nlp(text)\n", + " return [token.lemma_ for token in doc if token.pos_ == 'VERB']\n", + "\n", + "def get_adjectives(text):\n", + " doc = nlp(text)\n", + " return [token.lemma_ for token in doc if token.pos_ == 'ADJ']\n", + "\n", "def extract_terms(text):\n", - " return []" + " return {\n", + " \"nouns\": get_nouns(text),\n", + " \"verbs\": get_verbs(text),\n", + " \"adjectives\": get_adjectives(text)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "95494ac9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'nouns': ['programmer',\n", + " 'section',\n", + " 'swing',\n", + " 'application',\n", + " 'command',\n", + " 'line',\n", + " 'information',\n", + " 'swing',\n", + " 'application',\n", + " 'compilation',\n", + " 'instruction',\n", + " 'program',\n", + " 'applet',\n", + " 'application',\n", + " 'step',\n", + " 'release',\n", + " 'platform',\n", + " 'program',\n", + " 'swing',\n", + " 'component',\n", + " 'program',\n", + " 'program'],\n", + " 'verbs': ['explain',\n", + " 'compile',\n", + " 'run',\n", + " 'compile',\n", + " 'run',\n", + " 'use',\n", + " 'see',\n", + " 'run',\n", + " 'work',\n", + " 'need',\n", + " 'follow',\n", + " 'install',\n", + " 'do',\n", + " 'create',\n", + " 'use',\n", + " 'compile',\n", + " 'run'],\n", + " 'adjectives': ['late']}" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_terms(text)" ] } ],