Merge pull request '2nd meeting' (#2) from 2nd_meeting into master

Reviewed-on: robkw/introduction_to_recommender_systems#2
This commit is contained in:
Robert Kwieciński 2021-04-16 22:50:01 +02:00
commit 7838e0c156
6 changed files with 2997 additions and 563 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -0,0 +1,96 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['dimensions: 1, cases when observation is the nearest: 0.0%',\n",
" 'dimensions: 2, cases when observation is the nearest: 0.0%',\n",
" 'dimensions: 3, cases when observation is the nearest: 0.0%',\n",
" 'dimensions: 10, cases when observation is the nearest: 13.0%',\n",
" 'dimensions: 20, cases when observation is the nearest: 61.0%',\n",
" 'dimensions: 30, cases when observation is the nearest: 96.0%',\n",
" 'dimensions: 40, cases when observation is the nearest: 98.0%',\n",
" 'dimensions: 50, cases when observation is the nearest: 100.0%',\n",
" 'dimensions: 60, cases when observation is the nearest: 100.0%',\n",
" 'dimensions: 70, cases when observation is the nearest: 100.0%',\n",
" 'dimensions: 80, cases when observation is the nearest: 100.0%',\n",
" 'dimensions: 90, cases when observation is the nearest: 100.0%']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import random\n",
"from numpy.linalg import norm\n",
"\n",
"# Simulation: in how many trials is one of two randomly chosen vectors among\n",
"# the k vectors most similar (by cosine similarity) to their average?\n",
"SEED = 0  # fixed seed so that Restart & Run All reproduces the same numbers\n",
"dimensions = [1, 2, 3] + [10 * i for i in range(1, 10)]\n",
"nb_vectors = 10000\n",
"trials = 100\n",
"# by setting k=1 we check how often the vector closest to the average of\n",
"# 2 random vectors is one of these 2 vectors\n",
"k = 1\n",
"\n",
"# A single seeded generator drives both the data and the index sampling.\n",
"rng = np.random.default_rng(SEED)\n",
"\n",
"result = []\n",
"for dimension in dimensions:\n",
"    vectors = rng.normal(0, 1, size=(nb_vectors, dimension))\n",
"    # Row norms do not depend on the trial, so compute them once per dimension.\n",
"    vector_norms = norm(vectors, axis=1)\n",
"    successes = 0\n",
"    for _ in range(trials):\n",
"        i1, i2 = rng.choice(nb_vectors, size=2, replace=False)\n",
"        target = (vectors[i1] + vectors[i2]) / 2\n",
"\n",
"        # Cosine similarity of the target to every vector (higher = closer).\n",
"        similarities = vectors @ target / (norm(target) * vector_norms)\n",
"        # Indices of the k most similar vectors; their internal order is not\n",
"        # needed, so a partial partition replaces a full sort. Exact ties\n",
"        # (e.g. in 1 dimension, where every similarity is +1 or -1) are\n",
"        # broken arbitrarily.\n",
"        top_k = np.argpartition(similarities, -k)[-k:]\n",
"        if i1 in top_k or i2 in top_k:\n",
"            successes += 1\n",
"    result.append(successes / trials)\n",
"\n",
"# Format the rate directly as a percentage: multiplying a rounded float by 100\n",
"# can print artifacts such as 7.000000000000001%.\n",
"[\n",
"    f\"dimensions: {i}, cases when observation is the nearest: {j:.1%}\"\n",
"    for i, j in zip(dimensions, result)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

Binary file not shown.