03
This commit is contained in:
parent
67e6944843
commit
11ce1b1e10
1109
cw/03a_tfidf.ipynb
Normal file
1109
cw/03a_tfidf.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
89
cw/03a_tfidf_ODPOWIEDZI.ipynb
Normal file
89
cw/03a_tfidf_ODPOWIEDZI.ipynb
Normal file
@ -0,0 +1,89 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 3. <i>tfidf (1)</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def word_to_index(word):\n",
|
||||
" vec = np.zeros(len(vocabulary))\n",
|
||||
" if word in vocabulary:\n",
|
||||
" idx = vocabulary.index(word)\n",
|
||||
" vec[idx] = 1\n",
|
||||
" else:\n",
|
||||
" vec[-1] = 1\n",
|
||||
" return vec"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tf(document):\n",
|
||||
" document_vector = None\n",
|
||||
" for word in document:\n",
|
||||
" if document_vector is None:\n",
|
||||
" document_vector = word_to_index(word)\n",
|
||||
" else:\n",
|
||||
" document_vector += word_to_index(word)\n",
|
||||
" return document_vector"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def similarity(query, document):\n",
|
||||
" numerator = np.sum(query * document)\n",
|
||||
" denominator = np.sqrt(np.sum(query*query)) * np.sqrt(np.sum(document*document)) \n",
|
||||
" return numerator / denominator"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "3.tfidf (1)[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
515
cw/03b_tfidf_newsgroup.ipynb
Normal file
515
cw/03b_tfidf_newsgroup.ipynb
Normal file
@ -0,0 +1,515 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 3. <i>tfidf (2)</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Zajęcia 3\n",
|
||||
"\n",
|
||||
"Przydatne materiały:\n",
|
||||
"\n",
|
||||
"https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Importy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import sklearn.metrics\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zbiór danych"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups = fetch_20newsgroups()['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"11314"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(newsgroups)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"From: lerxst@wam.umd.edu (where's my thing)\n",
|
||||
"Subject: WHAT car is this!?\n",
|
||||
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
|
||||
"Organization: University of Maryland, College Park\n",
|
||||
"Lines: 15\n",
|
||||
"\n",
|
||||
" I was wondering if anyone out there could enlighten me on this car I saw\n",
|
||||
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
|
||||
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
|
||||
"the front bumper was separate from the rest of the body. This is \n",
|
||||
"all I know. If anyone can tellme a model name, engine specs, years\n",
|
||||
"of production, where this car is made, history, or whatever info you\n",
|
||||
"have on this funky looking car, please e-mail.\n",
|
||||
"\n",
|
||||
"Thanks,\n",
|
||||
"- IL\n",
|
||||
" ---- brought to you by your neighborhood Lerxst ----\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(newsgroups[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Naiwne przeszukiwanie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"all_documents = list() \n",
|
||||
"for document in newsgroups:\n",
|
||||
" if 'car' in document:\n",
|
||||
" all_documents.append(document)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"From: lerxst@wam.umd.edu (where's my thing)\n",
|
||||
"Subject: WHAT car is this!?\n",
|
||||
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
|
||||
"Organization: University of Maryland, College Park\n",
|
||||
"Lines: 15\n",
|
||||
"\n",
|
||||
" I was wondering if anyone out there could enlighten me on this car I saw\n",
|
||||
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
|
||||
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
|
||||
"the front bumper was separate from the rest of the body. This is \n",
|
||||
"all I know. If anyone can tellme a model name, engine specs, years\n",
|
||||
"of production, where this car is made, history, or whatever info you\n",
|
||||
"have on this funky looking car, please e-mail.\n",
|
||||
"\n",
|
||||
"Thanks,\n",
|
||||
"- IL\n",
|
||||
" ---- brought to you by your neighborhood Lerxst ----\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(all_documents[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"From: guykuo@carson.u.washington.edu (Guy Kuo)\n",
|
||||
"Subject: SI Clock Poll - Final Call\n",
|
||||
"Summary: Final call for SI clock reports\n",
|
||||
"Keywords: SI,acceleration,clock,upgrade\n",
|
||||
"Article-I.D.: shelley.1qvfo9INNc3s\n",
|
||||
"Organization: University of Washington\n",
|
||||
"Lines: 11\n",
|
||||
"NNTP-Posting-Host: carson.u.washington.edu\n",
|
||||
"\n",
|
||||
"A fair number of brave souls who upgraded their SI clock oscillator have\n",
|
||||
"shared their experiences for this poll. Please send a brief message detailing\n",
|
||||
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
|
||||
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
|
||||
"functionality with 800 and 1.4 m floppies are especially requested.\n",
|
||||
"\n",
|
||||
"I will be summarizing in the next two days, so please add to the network\n",
|
||||
"knowledge base if you have done the clock upgrade and haven't answered this\n",
|
||||
"poll. Thanks.\n",
|
||||
"\n",
|
||||
"Guy Kuo <guykuo@u.washington.edu>\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(all_documents[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Jakie są problemy z takim podejściem?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## TFIDF i podobieństwo cosinusowe - gotowe biblioteki"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = TfidfVectorizer()\n",
|
||||
"#vectorizer = TfidfVectorizer(use_idf = False, ngram_range=(1,2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document_vectors = vectorizer.fit_transform(newsgroups)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<11314x130107 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||
"\twith 1787565 stored elements in Compressed Sparse Row format>"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_vectors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<1x130107 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||
"\twith 89 stored elements in Compressed Sparse Row format>"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_vectors[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_vectors[0].todense()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"matrix([[0., 0., 0., ..., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., ..., 0., 0., 0.]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_vectors[0:4].todense()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_str = 'speed'\n",
|
||||
"#query_str = 'speed car'\n",
|
||||
"#query_str = 'spider man'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_vector = vectorizer.transform([query_str])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<11314x130107 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||
"\twith 1787565 stored elements in Compressed Sparse Row format>"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"document_vectors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<1x130107 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||
"\twith 1 stored elements in Compressed Sparse Row format>"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_vector"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"similarities = sklearn.metrics.pairwise.cosine_similarity(query_vector,document_vectors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([0.26949927, 0.3491801 , 0.44292083, 0.47784165])"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"np.sort(similarities)[0][-4:]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([4517, 5509, 2116, 9921])"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"similarities.argsort()[0][-4:]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range (1,5):\n",
|
||||
" print(newsgroups[similarities.argsort()[0][-i]])\n",
|
||||
" print(np.sort(similarities)[0,-i])\n",
|
||||
" print('-'*100)\n",
|
||||
" print('-'*100)\n",
|
||||
" print('-'*100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zadanie domowe\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"- Wybrać zbiór tekstowy, który ma co najmniej 10000 dokumentów (inny niż w tym przykładzie).\n",
|
||||
"- Na jego podstawie stworzyć wyszukiwarkę bazującą na OKAPI BM25, tzn. system, który dla podanej frazy podaje kilka (5-10) posortowanych najbardziej pasujących dokumentów razem ze scorami. Należy wypisywać też liczbę zwracanych dokumentów, czyli takich z niezerowym scorem. Można korzystać z gotowych bibliotek do wektoryzacji dokumentów, należy jednak samemu zaimplementować OKAPI BM25. \n",
|
||||
"- Znaleźć frazę (query), dla której wynik nie jest satysfakcjonujący.\n",
|
||||
"- Poprawić wyszukiwarkę (np. poprzez zmianę preprocessingu tekstu, wektoryzer, zmianę parametrów algorytmu rankującego lub sam algorytm) tak, żeby zwracała satysfakcjonujące wyniki dla poprzedniej frazy. Należy wprowadzić inną zmianę niż pokazana w tym przykładzie, czyli wymyślić coś własnego.\n",
|
||||
"- Zaprezentować pracę na zajęciach (06.04), odpowiadając na pytania:\n",
|
||||
" - jak wygląda zbiór i system wyszukiwania przed zmianami\n",
|
||||
" - dla jakiej frazy wyniki są niesatysfakcjonujące (pokazać wyniki)\n",
|
||||
" - jakie zmiany zostały naniesione\n",
|
||||
" - jak wyglądają wyniki wyszukiwania po zmianach\n",
|
||||
" - jak zmiany wpłynęły na wyniki (1-2 zdania)\n",
|
||||
" \n",
|
||||
"Prezentacja powinna być maksymalnie prosta i trwać maksymalnie 2-3 minuty.\n",
|
||||
"Punktów do zdobycia: 70\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "3.tfidf (2)[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
Loading…
Reference in New Issue
Block a user