From f4e46c4588c138e6d7232605be5ec02567f2735f Mon Sep 17 00:00:00 2001 From: Jakub Pokrywka Date: Tue, 22 Mar 2022 21:56:40 +0100 Subject: [PATCH] add 03 --- cw/03a_tfidf.ipynb | 137 +++++------------------ cw/03b_tfidf_newsgroup.ipynb | 209 +++++++++++++++++++++++++++++++++-- 2 files changed, 228 insertions(+), 118 deletions(-) diff --git a/cw/03a_tfidf.ipynb b/cw/03a_tfidf.ipynb index 292afe7..bb85f01 100644 --- a/cw/03a_tfidf.ipynb +++ b/cw/03a_tfidf.ipynb @@ -64,6 +64,14 @@ "- czy możemy ztokenizować tekst np. documents.split(' ') jakie wystąpią wtedy problemy?" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ODPOWIEDŹ\n", + "- lepiej użyć preprocessingu i dopiero później tokenizacji" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -295,15 +303,6 @@ "vocabulary" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PYTANIA\n", - "\n", - "jak będzie słowo \"jak\" w reprezentacji wektorowej TF?" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -359,13 +358,6 @@ " pass" ] }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 19, @@ -860,6 +852,9 @@ ], "source": [ "# dlatego potrzebujemy mianownik w cosine similarity\n", + "# dłuższe dokumenty, w których raz wystąpi słowo rower, są gorzej punktowane od\n", + "# krótszych.
Jeżeli słowo rower wystąpiło w bardzo krótkim dokumencie, to znaczy\n", + "# że jest większe prawdopodobieństwo, że dokument jest o rowerze\n", "query = 'rowerze'\n", "for i in range(len(documents)):\n", " display(documents[i])\n", @@ -965,7 +960,8 @@ } ], "source": [ - "# dlatego potrzebujemy term frequency → wiecej znaczy bardziej dopasowany dokument\n", + "# dlatego potrzebujemy term frequency → więcej wystąpień słowa w dokumencie\n", + "# znaczy bardziej dopasowany dokument\n", "query = 'i'\n", "for i in range(len(documents)):\n", " display(documents[i])\n", @@ -974,104 +970,26 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 3, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { - "data": { - "text/plain": [ - "'Ala lubi zwierzęta i ma kota oraz psa!'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.24999999999999994" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'Ola lubi zwierzęta oraz ma kota a także chomika!'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.2357022603955158" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'I Jan jeździ na rowerze.'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.31622776601683794" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'2 wojna światowa była wielkim konfliktem zbrojnym'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.39223227027636803" -
] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "NameError", + "evalue": "name 'documents' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# słowo chomik ma większą wagę od i, ponieważ występuje w mniejszej ilości dokumentów\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'i chomika'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdocuments\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdocuments\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msimilarity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransform_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdocuments_vectorized\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'documents' is not defined" + ] } ], "source": [ "# dlatego IDF - żeby ważniejsze słowa miał większą wagę\n", + "# słowo chomik ma większą wagę od i, ponieważ występuje 
w mniejszej ilości dokumentów\n", "query = 'i chomika'\n", "for i in range(len(documents)):\n", " display(documents[i])\n", @@ -1081,7 +999,10 @@ { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "### Uwaga\n", + "Powyższe przykłady pokazują score dokumentu. Aby zrobić wyszukiwarkę, powinniśmy posortować te dokumenty po score (od największego) i zaprezentować w tej kolejności." + ] } ], "metadata": { diff --git a/cw/03b_tfidf_newsgroup.ipynb b/cw/03b_tfidf_newsgroup.ipynb index 1967e81..05f3b7d 100644 --- a/cw/03b_tfidf_newsgroup.ipynb +++ b/cw/03b_tfidf_newsgroup.ipynb @@ -341,9 +341,9 @@ "metadata": {}, "outputs": [], "source": [ - "query_str = 'speed'\n", + "#query_str = 'speed'\n", "#query_str = 'speed car'\n", - "#query_str = 'spider man'" + "query_str = 'spider man'" ] }, { @@ -385,7 +385,7 @@ "data": { "text/plain": [ "<1x130107 sparse matrix of type ''\n", - "\twith 1 stored elements in Compressed Sparse Row format>" + "\twith 2 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 17, @@ -414,7 +414,7 @@ { "data": { "text/plain": [ - "array([0.26949927, 0.3491801 , 0.44292083, 0.47784165])" + "array([0.17360013, 0.22933014, 0.28954818, 0.45372239])" ] }, "execution_count": 19, @@ -434,7 +434,7 @@ { "data": { "text/plain": [ - "array([4517, 5509, 2116, 9921])" + "array([ 2455, 8920, 5497, 11031])" ] }, "execution_count": 20, @@ -448,11 +448,185 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "From: keiths@spider.co.uk (Keith Smith)\n", + "Subject: win/NT file systems\n", + "Organization: Spider Systems Limited, Edinburgh, UK.\n", + "Lines: 6\n", + "Nntp-Posting-Host: trapdoor.spider.co.uk\n", + "\n", + "OK will some one out there tell me why / how DOS 5\n", + "can read (I havn't tried writing in case it breaks something)\n", + "the Win/NT NTFS file
system.\n", + "I thought NTFS was supposed to be better than the FAT system\n", + "\n", + "keith\n", + "\n", + "0.4537223924558256\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: brandt@cs.unc.edu (Andrew Brandt)\n", + "Subject: Seeking good Alfa Romeo mechanic.\n", + "Organization: The University of North Carolina at Chapel Hill\n", + "Lines: 14\n", + "NNTP-Posting-Host: axon.cs.unc.edu\n", + "Keywords: alfa, romeo, spider, mechanic\n", + "\n", + "I am looking for recommendations for a good (great?) Alfa Romeo\n", + "mechanic in South Jersey or Philadelphia or nearby.\n", + "\n", + "I have a '78 Alfa Spider that needs some engine, tranny, steering work\n", + "done. The body is in quite good shape. The car is awful in cold\n", + "weather, won't start if below freezing (I know, I know, why drive a\n", + "Spider if there's snow on the ground ...). It has Bosch *mechanical*\n", + "fuel injection that I am sure needs adjustment.\n", + "\n", + "Any opinions are welcome on what to look for or who to call.\n", + "\n", + "Email or post (to rec.autos), I will summarize if people want.\n", + "\n", + "Thx, Andy (brandt@cs.unc.edu)\n", + "\n", + "0.28954817869991817\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: michaelr@spider.co.uk (Michael S. A. 
Robb)\n", + "Subject: Re: Honors Degrees: Do they mean anything?\n", + "Organization: Spider Systems Limited, Edinburgh, UK.\n", + "Lines: 44\n", + "\n", + "In article tkld@cogsci.ed.ac.uk (Kevin Davidson) writes:\n", + ">\n", + ">> In my opinion, a programming degree is still worth having.\n", + ">\n", + "> Yes, but a CS degree is *not* a programming degree. Does anybody know of\n", + ">a computing course where *programming* is taught ? Computer Science is\n", + ">a branch of maths (or the course I did was).\n", + "> I've also done a Software Engineering course - much more practical and likely\n", + ">to be the sort of thing an employer really wants, rather than what they think\n", + ">they want, but also did not teach programming. The ability to program was\n", + ">an entry requirement.\n", + "\n", + "At Robert Gordon University, programming was the main (most time-consuming) \n", + "start of the course. The first two years consisted of five subjects:\n", + "Software Engineering (Pascal/C/UNIX), Computer Engineering (6502/6809/68000 \n", + "assembler), Computer Theory (LISP/Prolog), Mathematics/Statistics and \n", + "Communication Skills (How to pass interviews/intelligence tests and group\n", + "discussions e.g. How to survive a helicopter crash in the North Sea).\n", + "The third year (Industrial placement) was spent working for a computer company \n", + "for a year. The company could be anywhere in Europe (there was a special \n", + "Travel Allowance Scheme to cover the visiting costs of professors). \n", + "The fourth year included Operating Systems(C/Modula-2), Software Engineering \n", + "(C/8086 assembler), Real Time Laboratory (C/68000 assembler) and Computing \n", + "Theory (LISP). 
There were also Group Projects in 2nd and 4th Years, where \n", + "students worked in teams to select their own project or decide to work for an \n", + "outside company (the only disadvantage being that specifications would change \n", + "suddenly).\n", + " \n", + "In the first four years, there was a 50%:50% weighting between courseworks and \n", + "exams for most subjects. However in the Honours year, this was reduced to a \n", + "30%:70% split between an Individual Project and final exams (no coursework \n", + "assessment) - are all Computer Science courses like this?\n", + "\n", + "BTW - we started off with 22 students in our first year and were left with 8 by\n", + "Honours year. Also, every course is tutored separately. Not easy trying\n", + "to sleep when you are in 8 student class :-). \n", + "\n", + "Cheers,\n", + " Michael \n", + "-- \n", + "| Michael S. A. Robb | Tel: +44 31 554 9424 | \"..The problem with bolt-on\n", + "| Software Engineer | Fax: +44 31 554 0649 | software is making sure the\n", + "| Spider Systems Limited | E-mail: | bolts are the right size..\"\n", + "| Edinburgh, EH6 5NG | michaelr@spider.co.uk | - Anonymous\n", + "\n", + "0.22933013891071233\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: jrm@elm.circa.ufl.edu (Jeff Mason)\n", + "Subject: AUCTION: Marvel, DC, Valiant, Image, Dark Horse, etc...\n", + "Organization: Univ. of Florida Psychology Dept.\n", + "Lines: 59\n", + "NNTP-Posting-Host: elm.circa.ufl.edu\n", + "\n", + "I am auctioning off the following comics. These minimum bids are set\n", + "below what I would normally sell them for. 
Make an offer, and I will\n", + "accept the highest bid after the auction has been completed.\n", + "\n", + "TITLE Minimum/Current \n", + "--------------------------------------------------------------\n", + "Alpha Flight 51 (Jim Lee's first work at Marvel)\t$ 5.00\n", + "Aliens 1 (1st app Aliens in comics, 1st prnt, May 1988)\t$20.00\n", + "Amazing Spider-Man 136 (Intro new Green Goblin) $20.00\n", + "Amazing Spider-Man 238 (1st appearance Hobgoblin)\t$50.00\n", + "Archer and Armstrong 1 (Frank Miller/Smith/Layton)\t$ 7.50\n", + "Avengers 263 (1st appearance X-factor) $ 3.50\n", + "Bloodshot 1 (Chromium cover, BWSmith Cover/Poster)\t$ 5.00\n", + "Daredevil 158 (Frank Miller art begins) $35.00\n", + "Dark Horse Presents 1 (1st app Concrete, 1st printing)\t$ 7.50 \n", + "H.A.R.D. Corps 1 \t\t\t\t\t$ 5.00\n", + "Incredible Hulk 324 (1st app Grey Hulk since #1, 1962)\t$ 7.50\n", + "Incredible Hulk 330 (1st McFarlane issue)\t\t$15.00\n", + "Incredible Hulk 331 (Grey Hulk series begins)\t\t$11.20\t\n", + "Incredible Hulk 367 (1st Dale Keown art in Hulk) $15.00\n", + "Incredible Hulk 377 (1st all new hulk, 1st prnt, Keown) $15.00\n", + "Marvel Comics Presents 1 (Wolverine, Silver Surfer) $ 7.50\n", + "Maxx Limited Ashcan (4000 copies exist, blue cover)\t$30.00\n", + "New Mutants 86 (McFarlane cover, 1st app Cable - cameo)\t$10.00\n", + "New Mutants 100 (1st app X-Force) $ 5.00\n", + "New Mutants Annual 5 (1st Liefeld art on New Mutants)\t$10.00\n", + "Omega Men 3 (1st appearance Lobo) $ 7.50\n", + "Omega Men 10 (1st full Lobo story) $ 7.50\n", + "Power Man & Iron Fist 78 (3rd appearance Sabretooth) $25.00\n", + " 84 (4th appearance Sabretooth) $20.00\n", + "Simpsons Comics and Stories 1 (Polybagged special ed.)\t$ 7.50\n", + "Spectacular Spider-Man 147 (1st app New Hobgoblin) $12.50\n", + "Star Trek the Next Generation 1 (Feb 1988, DC mini) $ 7.50\n", + "Star Trek the Next Generation 1 (Oct 1989, DC comics) $ 7.50\n", + "Web of Spider-Man 29 (Hobgoblin, Wolverine appear) 
$10.00 \n", + "Web of Spider-Man 30 (Origin Rose, Hobgoblin appears) $ 7.50\n", + "Wolverine 10 (Before claws, 1st battle with Sabretooth)\t$15.00\n", + "Wolverine 41 (Sabretooth claims to be Wolverine's dad)\t$ 5.00\n", + "Wolverine 42 (Sabretooth proven not to be his dad)\t$ 3.50\n", + "Wolverine 43 (Sabretooth/Wolverine saga concludes)\t$ 3.00\n", + "Wolverine 1 (1982 mini-series, Miller art)\t\t$20.00\n", + "Wonder Woman 267 (Return of Animal Man) $12.50\n", + "X-Force 1 (Signed by Liefeld, Bagged, X-Force card) $20.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Shatterstar card) $10.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Deadpool card) $10.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Sunspot/Gideon) $10.00\n", + "\n", + "All comics are in near mint to mint condition, are bagged in shiny \n", + "polypropylene bags, and backed with white acid free boards. Shipping is\n", + "$1.50 for one book, $3.00 for more than one book, or free if you order \n", + "a large enough amount of stuff. I am willing to haggle.\n", + "\n", + "I have thousands and thousands of other comics, so please let me know what \n", + "you've been looking for, and maybe I can help. Some titles I have posted\n", + "here don't list every issue I have of that title, I tried to save space.\n", + "-- \n", + "Geoffrey R. 
Mason\t\t|\tjrm@elm.circa.ufl.edu\n", + "Department of Psychology\t|\tmason@webb.psych.ufl.edu\n", + "University of Florida\t\t|\tprothan@maple.circa.ufl.edu\n", + "\n", + "0.17360012846950526\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "for i in range (1,5):\n", " print(newsgroups[similarities.argsort()[0][-i]])\n", @@ -462,6 +636,14 @@ " print('-'*100)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### analiza\n", + "Dla frazy \"spider man\" (komórka 14) wynik zapytania jest niesatysfakcjonujący, ponieważ pierwszy artykuł nie jest o spider-man'ie, ale zawiera tylko słowa \"spider\". Po zmianie metody wektoryzacji (komórka 8) jako pierwszy wynik pojawia się istotnie film o spider manie (proszę to sprawdzić samodzielnie). Wynika to z faktu, że używamy również bigramów. W ten sposób poprawiliśmy wyszukiwarkę dla tego konkretnego przykładu (chociaż nie wiemy czy nie popsuliśmy wyszukiwarki w innym przypadku- w tym ćwiczeniu nie przejmujemy się tym)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -469,10 +651,10 @@ "## Zadanie domowe\n", "\n", "\n", - "- Wybrać zbiór tekstowy, który ma conajmniej 10000 dokumentów (inny niż w tym przykładzie).\n", - "- Na jego podstawie stworzyć wyszukiwarkę bazującą na OKAPI BM25, tzn. system który dla podanej frazy podaje kilka (5-10) posortowanych najbardziej pasujących dokumentów razem ze scorami. Należy wypisywać też ilość zwracanych dokumentów, czyli takich z niezerowym scorem. Można korzystać z gotowych bibliotek do wektoryzacji dokumentów, należy jednak samemu zaimplementować OKAPI BM25. 
\n", + "- Wybrać zbiór tekstowy, który ma co najmniej 10_000 dokumentów (inny niż w tym przykładzie).\n", + "- Na jego podstawie stworzyć wyszukiwarkę bazującą na OKAPI BM25, tzn. system, który dla podanej frazy podaje kilka (5-10) posortowanych najbardziej pasujących dokumentów razem ze scorami. Należy wypisywać też ilość zwracanych dokumentów, czyli takich z niezerowym scorem. Można korzystać z gotowych bibliotek do wektoryzacji dokumentów, należy jednak samemu zaimplementować OKAPI BM25. Można użyć dowolnych parametrów TF-IDF.\n", "- Znaleźć frazę (query), dla której wynik nie jest satysfakcjonujący.\n", - "- Poprawić wyszukiwarkę (np. poprzez zmianę preprocessingu tekstu, wektoryzer, zmianę parametrów algorytmu rankującego lub sam algorytm) tak, żeby zwracała satysfakcjonujące wyniki dla poprzedniej frazy. Należy zrobić inną zmianę niż w tym przykładzie, tylko wymyślić coś własnego.\n", + "- Poprawić wyszukiwarkę (np. poprzez zmianę preprocessingu tekstu, wektoryzera, zmianę parametrów algorytmu rankującego lub sam algorytm) tak, żeby zwracała satysfakcjonujące wyniki dla poprzedniej frazy. Należy zrobić inną zmianę niż w powyższym przykładzie (czyli coś innego niż użycie bigramów), tylko wymyślić coś własnego.\n", "- prezentować pracę na zajęciach (06.04) odpowiadając na pytania:\n", " - jak wygląda zbiór i system wyszukiwania przed zmianami\n", " - dla jakiej frazy wyniki są niesatysfakcjonujące (pokazać wyniki)\n", @@ -483,6 +665,13 @@ "Prezentacja powinna być maksymalnie prosta i trwać maksymalnie 2-3 minuty.\n", "punktów do zdobycia: 70\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {