{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zPOfPO5LAOqy", "outputId": "a8846a75-ef0a-4048-8168-f71d79d7b7e8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: kaggle in /usr/local/lib/python3.9/dist-packages (1.5.13)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.27.1)\n", "Requirement already satisfied: python-slugify in /usr/local/lib/python3.9/dist-packages (from kaggle) (8.0.1)\n", "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.16.0)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from kaggle) (4.65.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.9/dist-packages (from kaggle) (2022.12.7)\n", "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.8.2)\n", "Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.26.15)\n", "Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.9/dist-packages (from python-slugify->kaggle) (1.3)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (3.4)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (2.0.12)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (1.4.4)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: 
pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2022.7.1)\n", "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.9/dist-packages (from pandas) (1.22.4)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n" ] } ], "source": [ "# Use the %pip magic so the packages are installed for the interpreter that runs this kernel;\n", "# a shell-level !pip can resolve to a pip that belongs to a different Python.\n", "%pip install --user kaggle\n", "%pip install --user pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gc7VHACRAOq0", "outputId": "20220fe9-e872-451b-f759-b4cfff91bc51" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Traceback (most recent call last):\n", " File \"/usr/local/bin/kaggle\", line 5, in \n", " from kaggle.cli import main\n", " File \"/usr/local/lib/python3.9/dist-packages/kaggle/__init__.py\", line 23, in \n", " api.authenticate()\n", " File \"/usr/local/lib/python3.9/dist-packages/kaggle/api/kaggle_api_extended.py\", line 164, in authenticate\n", " raise IOError('Could not find {}. Make sure it\\'s located in'\n", "OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. 
Or use the environment method.\n" ] } ], "source": [ "!kaggle datasets download -d dylanjcastillo/7k-books-with-metadata" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "utslvpN1AOq0", "outputId": "dda342a0-18dc-40a7-86bd-b233844c1231", "colab": { "base_uri": "https://localhost:8080/" } }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Archive: 7k-books-with-metadata.zip\n", " inflating: books.csv \n" ] } ], "source": [ "!unzip -o 7k-books-with-metadata.zip" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "k9Q3DwbiAOq0", "outputId": "ab0a4f14-188b-41d6-c3fe-0553d80aa648", "colab": { "base_uri": "https://localhost:8080/", "height": 676 } }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " isbn13 isbn10 title \\\n", "0 9780002005883 0002005883 Gilead \n", "1 9780002261982 0002261987 Spider's Web \n", "2 9780006163831 0006163831 The One Tree \n", "3 9780006178736 0006178731 Rage of angels \n", "4 9780006280897 0006280897 The Four Loves \n", "... ... ... ... \n", "6805 9788185300535 8185300534 I Am that \n", "6806 9788185944609 8185944601 Secrets Of The Heart \n", "6807 9788445074879 8445074873 Fahrenheit 451 \n", "6808 9789027712059 9027712050 The Berlin Phenomenology \n", "6809 9789042003408 9042003405 'I'm Telling You Stories' \n", "\n", " subtitle \\\n", "0 NaN \n", "1 A Novel \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "6805 Talks with Sri Nisargadatta Maharaj \n", "6806 NaN \n", "6807 NaN \n", "6808 NaN \n", "6809 Jeanette Winterson and the Politics of Reading \n", "\n", " authors \\\n", "0 Marilynne Robinson \n", "1 Charles Osborne;Agatha Christie \n", "2 Stephen R. Donaldson \n", "3 Sidney Sheldon \n", "4 Clive Staples Lewis \n", "... ... \n", "6805 Sri Nisargadatta Maharaj;Sudhakar S. 
Dikshit \n", "6806 Khalil Gibran \n", "6807 Ray Bradbury \n", "6808 Georg Wilhelm Friedrich Hegel \n", "6809 Helena Grice;Tim Woods \n", "\n", " categories \\\n", "0 Fiction \n", "1 Detective and mystery stories \n", "2 American fiction \n", "3 Fiction \n", "4 Christian life \n", "... ... \n", "6805 Philosophy \n", "6806 Mysticism \n", "6807 Book burning \n", "6808 History \n", "6809 Literary Criticism \n", "\n", " thumbnail \\\n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=OmQaw... \n", "3 http://books.google.com/books/content?id=FKo2T... \n", "4 http://books.google.com/books/content?id=XhQ5X... \n", "... ... \n", "6805 http://books.google.com/books/content?id=Fv_JP... \n", "6806 http://books.google.com/books/content?id=XcrVp... \n", "6807 NaN \n", "6808 http://books.google.com/books/content?id=Vy7Sk... \n", "6809 http://books.google.com/books/content?id=2lVyR... \n", "\n", " description published_year \\\n", "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n", "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n", "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n", "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n", "4 Lewis' work on the nature of love divides love... 2002.0 \n", "... ... ... \n", "6805 This collection of the timeless teachings of o... 1999.0 \n", "6806 NaN 1993.0 \n", "6807 NaN 2004.0 \n", "6808 Since the three volume edition ofHegel's Philo... 1981.0 \n", "6809 This is a jubilant and rewarding collection of... 1998.0 \n", "\n", " average_rating num_pages ratings_count \n", "0 3.85 247.0 361.0 \n", "1 3.83 241.0 5164.0 \n", "2 3.97 479.0 172.0 \n", "3 3.93 512.0 29532.0 \n", "4 4.15 170.0 33684.0 \n", "... ... ... ... 
\n", "6805 4.51 531.0 104.0 \n", "6806 4.08 74.0 324.0 \n", "6807 3.98 186.0 5733.0 \n", "6808 0.00 210.0 0.0 \n", "6809 3.70 136.0 10.0 \n", "\n", "[6810 rows x 12 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13isbn10titlesubtitleauthorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_count
097800020058830002005883GileadNaNMarilynne RobinsonFictionhttp://books.google.com/books/content?id=KQZCP...A NOVEL THAT READERS and critics have been eag...2004.03.85247.0361.0
197800022619820002261987Spider's WebA NovelCharles Osborne;Agatha ChristieDetective and mystery storieshttp://books.google.com/books/content?id=gA5GP...A new 'Christie for Christmas' -- a full-lengt...2000.03.83241.05164.0
297800061638310006163831The One TreeNaNStephen R. DonaldsonAmerican fictionhttp://books.google.com/books/content?id=OmQaw...Volume Two of Stephen Donaldson's acclaimed se...1982.03.97479.0172.0
397800061787360006178731Rage of angelsNaNSidney SheldonFictionhttp://books.google.com/books/content?id=FKo2T...A memorable, mesmerizing heroine Jennifer -- b...1993.03.93512.029532.0
497800062808970006280897The Four LovesNaNClive Staples LewisChristian lifehttp://books.google.com/books/content?id=XhQ5X...Lewis' work on the nature of love divides love...2002.04.15170.033684.0
.......................................
680597881853005358185300534I Am thatTalks with Sri Nisargadatta MaharajSri Nisargadatta Maharaj;Sudhakar S. DikshitPhilosophyhttp://books.google.com/books/content?id=Fv_JP...This collection of the timeless teachings of o...1999.04.51531.0104.0
680697881859446098185944601Secrets Of The HeartNaNKhalil GibranMysticismhttp://books.google.com/books/content?id=XcrVp...NaN1993.04.0874.0324.0
680797884450748798445074873Fahrenheit 451NaNRay BradburyBook burningNaNNaN2004.03.98186.05733.0
680897890277120599027712050The Berlin PhenomenologyNaNGeorg Wilhelm Friedrich HegelHistoryhttp://books.google.com/books/content?id=Vy7Sk...Since the three volume edition ofHegel's Philo...1981.00.00210.00.0
680997890420034089042003405'I'm Telling You Stories'Jeanette Winterson and the Politics of ReadingHelena Grice;Tim WoodsLiterary Criticismhttp://books.google.com/books/content?id=2lVyR...This is a jubilant and rewarding collection of...1998.03.70136.010.0
\n", "

6810 rows × 12 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 16 } ], "source": [ "import pandas as pd\n", "\n", "# Read the extracted books.csv into a DataFrame; the bare name on the last line renders it.\n", "books = pd.read_csv('books.csv')\n", "books" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true, "id": "WgVroQDTAOq1", "outputId": "932fdfce-1d65-4290-cc5d-cc053f4fa459" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13isbn10titlesubtitleauthorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_count
count6.810000e+0368106810238167386711648165486804.0000006767.0000006767.0000006.767000e+03
uniqueNaN681063982009378056764816474NaNNaNNaNNaN
topNaN0786282258The Lord of the RingsA NovelAgatha ChristieFictionhttp://books.google.com/books/content?id=6dVAW...This is a reproduction of the original artefac...NaNNaNNaNNaN
freqNaN11122637258816NaNNaNNaNNaN
mean9.780677e+12NaNNaNNaNNaNNaNNaNNaN1998.6303643.933284348.1810262.106910e+04
std6.068911e+08NaNNaNNaNNaNNaNNaNNaN10.4842570.331352242.3767831.376207e+05
min9.780002e+12NaNNaNNaNNaNNaNNaNNaN1853.0000000.0000000.0000000.000000e+00
25%9.780330e+12NaNNaNNaNNaNNaNNaNNaN1996.0000003.770000208.0000001.590000e+02
50%9.780553e+12NaNNaNNaNNaNNaNNaNNaN2002.0000003.960000304.0000001.018000e+03
75%9.780810e+12NaNNaNNaNNaNNaNNaNNaN2005.0000004.130000420.0000005.992500e+03
max9.789042e+12NaNNaNNaNNaNNaNNaNNaN2019.0000005.0000003342.0000005.629932e+06
\n", "
" ], "text/plain": [ " isbn13 isbn10 title subtitle \\\n", "count 6.810000e+03 6810 6810 2381 \n", "unique NaN 6810 6398 2009 \n", "top NaN 0786282258 The Lord of the Rings A Novel \n", "freq NaN 1 11 226 \n", "mean 9.780677e+12 NaN NaN NaN \n", "std 6.068911e+08 NaN NaN NaN \n", "min 9.780002e+12 NaN NaN NaN \n", "25% 9.780330e+12 NaN NaN NaN \n", "50% 9.780553e+12 NaN NaN NaN \n", "75% 9.780810e+12 NaN NaN NaN \n", "max 9.789042e+12 NaN NaN NaN \n", "\n", " authors categories \\\n", "count 6738 6711 \n", "unique 3780 567 \n", "top Agatha Christie Fiction \n", "freq 37 2588 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " thumbnail \\\n", "count 6481 \n", "unique 6481 \n", "top http://books.google.com/books/content?id=6dVAW... \n", "freq 1 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " description published_year \\\n", "count 6548 6804.000000 \n", "unique 6474 NaN \n", "top This is a reproduction of the original artefac... 
NaN \n", "freq 6 NaN \n", "mean NaN 1998.630364 \n", "std NaN 10.484257 \n", "min NaN 1853.000000 \n", "25% NaN 1996.000000 \n", "50% NaN 2002.000000 \n", "75% NaN 2005.000000 \n", "max NaN 2019.000000 \n", "\n", " average_rating num_pages ratings_count \n", "count 6767.000000 6767.000000 6.767000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 3.933284 348.181026 2.106910e+04 \n", "std 0.331352 242.376783 1.376207e+05 \n", "min 0.000000 0.000000 0.000000e+00 \n", "25% 3.770000 208.000000 1.590000e+02 \n", "50% 3.960000 304.000000 1.018000e+03 \n", "75% 4.130000 420.000000 5.992500e+03 \n", "max 5.000000 3342.000000 5.629932e+06 " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books.describe(include='all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1hwHH65hAOq1", "outputId": "0b3e32ab-230b-4d9d-db8a-d2d25e57161b" }, "outputs": [ { "data": { "text/plain": [ "isbn13 0\n", "isbn10 0\n", "title 0\n", "subtitle 4429\n", "authors 72\n", "categories 99\n", "thumbnail 329\n", "description 262\n", "published_year 6\n", "average_rating 43\n", "num_pages 43\n", "ratings_count 43\n", "dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mZMFUt2pAOq1" }, "outputs": [], "source": [ "# Remove the five columns this notebook discards, in one call.\n", "# errors='ignore' keeps the cell re-runnable: on a second run the columns are already\n", "# gone and drop() would otherwise raise KeyError.\n", "UNUSED_COLUMNS = ['thumbnail', 'subtitle', 'description', 'isbn13', 'isbn10']\n", "books = books.drop(columns=UNUSED_COLUMNS, errors='ignore')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "y6I2PKuhAOq1", "outputId": "2e03efc3-e8e3-4cc8-abb8-97e0594665be" }, "outputs": [ { "data": { "text/plain": [ "title 0\n", "authors 72\n", "categories 99\n", "published_year 6\n", "average_rating 43\n", "num_pages 43\n", 
"ratings_count 43\n", "dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "21R7h40lAOq1", "outputId": "4c9f746b-4347-4cd3-cdae-21e7bc818f2c" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
0GileadMarilynne RobinsonFiction2004.03.85247.0361.0
1Spider's WebCharles Osborne;Agatha ChristieDetective and mystery stories2000.03.83241.05164.0
2The One TreeStephen R. DonaldsonAmerican fiction1982.03.97479.0172.0
3Rage of angelsSidney SheldonFiction1993.03.93512.029532.0
4The Four LovesClive Staples LewisChristian life2002.04.15170.033684.0
........................
6805I Am thatSri Nisargadatta Maharaj;Sudhakar S. DikshitPhilosophy1999.04.51531.0104.0
6806Secrets Of The HeartKhalil GibranMysticism1993.04.0874.0324.0
6807Fahrenheit 451Ray BradburyBook burning2004.03.98186.05733.0
6808The Berlin PhenomenologyGeorg Wilhelm Friedrich HegelHistory1981.00.00210.00.0
6809'I'm Telling You Stories'Helena Grice;Tim WoodsLiterary Criticism1998.03.70136.010.0
\n", "

6599 rows × 7 columns

\n", "
" ], "text/plain": [ " title authors \\\n", "0 Gilead Marilynne Robinson \n", "1 Spider's Web Charles Osborne;Agatha Christie \n", "2 The One Tree Stephen R. Donaldson \n", "3 Rage of angels Sidney Sheldon \n", "4 The Four Loves Clive Staples Lewis \n", "... ... ... \n", "6805 I Am that Sri Nisargadatta Maharaj;Sudhakar S. Dikshit \n", "6806 Secrets Of The Heart Khalil Gibran \n", "6807 Fahrenheit 451 Ray Bradbury \n", "6808 The Berlin Phenomenology Georg Wilhelm Friedrich Hegel \n", "6809 'I'm Telling You Stories' Helena Grice;Tim Woods \n", "\n", " categories published_year average_rating \\\n", "0 Fiction 2004.0 3.85 \n", "1 Detective and mystery stories 2000.0 3.83 \n", "2 American fiction 1982.0 3.97 \n", "3 Fiction 1993.0 3.93 \n", "4 Christian life 2002.0 4.15 \n", "... ... ... ... \n", "6805 Philosophy 1999.0 4.51 \n", "6806 Mysticism 1993.0 4.08 \n", "6807 Book burning 2004.0 3.98 \n", "6808 History 1981.0 0.00 \n", "6809 Literary Criticism 1998.0 3.70 \n", "\n", " num_pages ratings_count \n", "0 247.0 361.0 \n", "1 241.0 5164.0 \n", "2 479.0 172.0 \n", "3 512.0 29532.0 \n", "4 170.0 33684.0 \n", "... ... ... \n", "6805 531.0 104.0 \n", "6806 74.0 324.0 \n", "6807 186.0 5733.0 \n", "6808 210.0 0.0 \n", "6809 136.0 10.0 \n", "\n", "[6599 rows x 7 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books.dropna(inplace=True)\n", "books" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lx9gqh7UAOq2", "outputId": "651a374e-eb8c-426f-faa0-c1cd6c9762bb" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
count6599659965996599.0000006599.0000006599.0000006.599000e+03
unique62163728563NaNNaNNaNNaN
topThe Lord of the RingsAgatha ChristieFictionNaNNaNNaNNaN
freq9372561NaNNaNNaNNaN
meanNaNNaNNaN1998.7504173.931367348.2968632.143083e+04
stdNaNNaNNaN10.1684650.331173239.1994111.392929e+05
minNaNNaNNaN1876.0000000.0000000.0000000.000000e+00
25%NaNNaNNaN1997.0000003.770000208.0000001.630000e+02
50%NaNNaNNaN2002.0000003.950000304.0000001.032000e+03
75%NaNNaNNaN2005.0000004.130000420.0000006.105500e+03
maxNaNNaNNaN2019.0000005.0000003342.0000005.629932e+06
\n", "
" ], "text/plain": [ " title authors categories published_year \\\n", "count 6599 6599 6599 6599.000000 \n", "unique 6216 3728 563 NaN \n", "top The Lord of the Rings Agatha Christie Fiction NaN \n", "freq 9 37 2561 NaN \n", "mean NaN NaN NaN 1998.750417 \n", "std NaN NaN NaN 10.168465 \n", "min NaN NaN NaN 1876.000000 \n", "25% NaN NaN NaN 1997.000000 \n", "50% NaN NaN NaN 2002.000000 \n", "75% NaN NaN NaN 2005.000000 \n", "max NaN NaN NaN 2019.000000 \n", "\n", " average_rating num_pages ratings_count \n", "count 6599.000000 6599.000000 6.599000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 3.931367 348.296863 2.143083e+04 \n", "std 0.331173 239.199411 1.392929e+05 \n", "min 0.000000 0.000000 0.000000e+00 \n", "25% 3.770000 208.000000 1.630000e+02 \n", "50% 3.950000 304.000000 1.032000e+03 \n", "75% 4.130000 420.000000 6.105500e+03 \n", "max 5.000000 3342.000000 5.629932e+06 " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books.describe(include='all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "J7DUOhOwAOq2", "outputId": "3bce3396-8f22-41a4-ebfb-9895ad2bb73c" }, "outputs": [ { "data": { "text/plain": [ "Fiction 2561\n", "Juvenile Fiction 524\n", "Biography & Autobiography 398\n", "History 261\n", "Literary Criticism 165\n", " ... \n", "Child analysis 1\n", "Illinois 1\n", "Erinyes (Greek mythology) 1\n", "Exorcism 1\n", "People with social disabilities 1\n", "Name: categories, Length: 563, dtype: int64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books[\"categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4R3GDLXgAOq2", "outputId": "4d3a9d8a-f37d-4cba-ebbb-0615571396f4" }, "outputs": [ { "data": { "text/plain": [ "2006.0 877\n", "2005.0 681\n", "2004.0 605\n", "2003.0 569\n", "2002.0 470\n", " ... 
\n", "1928.0 1\n", "1904.0 1\n", "1938.0 1\n", "1936.0 1\n", "1947.0 1\n", "Name: published_year, Length: 91, dtype: int64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books[\"published_year\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YSLMCB4nAOq2", "outputId": "ccdb49cc-9037-4995-9d0b-c0a749f6eae1" }, "outputs": [ { "data": { "text/plain": [ "Agatha Christie 37\n", "Stephen King 36\n", "William Shakespeare 29\n", "John Ronald Reuel Tolkien 25\n", "Sandra Brown 23\n", " ..\n", "Aeg 1\n", "Pauline Reage 1\n", "Tim Flannery 1\n", "Saint Augustine (of Hippo) 1\n", "Michael S. Reynolds 1\n", "Name: authors, Length: 3728, dtype: int64" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books[\"authors\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "D8HrFKIGAOq3", "outputId": "20a73c84-1b66-4dd8-fa99-caba6ca68b29" }, "outputs": [ { "data": { "text/plain": [ "4.00 125\n", "3.93 110\n", "3.95 109\n", "3.99 108\n", "3.96 104\n", " ... \n", "4.64 1\n", "4.68 1\n", "4.72 1\n", "2.44 1\n", "4.78 1\n", "Name: average_rating, Length: 200, dtype: int64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books[\"average_rating\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "utiDxb60AOq3" }, "outputs": [], "source": [ "import sklearn\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Hold out 20% of the rows as the test set, then divide the remaining rows in half\n", "# into training and validation sets. Both calls pass random_state=1.\n", "books_rest, books_test = train_test_split(books, test_size=0.2, random_state=1)\n", "books_train, books_val = train_test_split(books_rest, test_size=0.5, random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rS0epPE6AOq3", "outputId": "f704dda5-95e7-474b-a9b3-d8e107067710" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
915The Autobiography of Alice B. ToklasGertrude SteinBiography & Autobiography2001.03.59272.0233.0
4493Never Far from NowhereAndrea LevyBlacks1996.03.68282.0601.0
1983Year's Happy EndingBetty NeelsFiction2001.03.95216.0128.0
2196Wrinkles in TimeGeorge Smoot;Keay DavidsonScience1994.03.99360.0985.0
4011DispatchesMichael HerrHistory1991.04.23260.012590.0
........................
2841Magic BitesIlona AndrewsFiction2007.04.07260.082231.0
1713High FiveJanet EvanovichBail bond agents2000.04.18336.099172.0
3469A Brief History of TimeStephen HawkingScience1998.04.16212.0214520.0
1657The MagusJohn FowlesFiction2001.04.05656.036909.0
3986The Complete Monty Python's Flying CircusGraham Chapman;Monty Python (Comedy troupe);Te...Humor1989.04.44384.01191.0
\n", "

2639 rows × 7 columns

\n", "
" ], "text/plain": [ " title \\\n", "915 The Autobiography of Alice B. Toklas \n", "4493 Never Far from Nowhere \n", "1983 Year's Happy Ending \n", "2196 Wrinkles in Time \n", "4011 Dispatches \n", "... ... \n", "2841 Magic Bites \n", "1713 High Five \n", "3469 A Brief History of Time \n", "1657 The Magus \n", "3986 The Complete Monty Python's Flying Circus \n", "\n", " authors \\\n", "915 Gertrude Stein \n", "4493 Andrea Levy \n", "1983 Betty Neels \n", "2196 George Smoot;Keay Davidson \n", "4011 Michael Herr \n", "... ... \n", "2841 Ilona Andrews \n", "1713 Janet Evanovich \n", "3469 Stephen Hawking \n", "1657 John Fowles \n", "3986 Graham Chapman;Monty Python (Comedy troupe);Te... \n", "\n", " categories published_year average_rating num_pages \\\n", "915 Biography & Autobiography 2001.0 3.59 272.0 \n", "4493 Blacks 1996.0 3.68 282.0 \n", "1983 Fiction 2001.0 3.95 216.0 \n", "2196 Science 1994.0 3.99 360.0 \n", "4011 History 1991.0 4.23 260.0 \n", "... ... ... ... ... \n", "2841 Fiction 2007.0 4.07 260.0 \n", "1713 Bail bond agents 2000.0 4.18 336.0 \n", "3469 Science 1998.0 4.16 212.0 \n", "1657 Fiction 2001.0 4.05 656.0 \n", "3986 Humor 1989.0 4.44 384.0 \n", "\n", " ratings_count \n", "915 233.0 \n", "4493 601.0 \n", "1983 128.0 \n", "2196 985.0 \n", "4011 12590.0 \n", "... ... \n", "2841 82231.0 \n", "1713 99172.0 \n", "3469 214520.0 \n", "1657 36909.0 \n", "3986 1191.0 \n", "\n", "[2639 rows x 7 columns]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books_train" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "oUWEVGaGAOq3", "outputId": "6a053600-98a9-4990-ae44-cb8eeda97293" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
count2639263926392639.0000002639.0000002639.0000002.639000e+03
unique25471827286NaNNaNNaNNaN
topOne Hundred Years of SolitudeStephen KingFictionNaNNaNNaNNaN
freq4181027NaNNaNNaNNaN
meanNaNNaNNaN1999.0329673.929807349.5346722.363199e+04
stdNaNNaNNaN9.8653200.358919244.8710901.452470e+05
minNaNNaNNaN1876.0000000.0000000.0000000.000000e+00
25%NaNNaNNaN1997.0000003.770000208.0000001.745000e+02
50%NaNNaNNaN2002.0000003.950000304.0000001.066000e+03
75%NaNNaNNaN2005.0000004.130000429.0000006.084500e+03
maxNaNNaNNaN2019.0000005.0000003020.0000004.367341e+06
\n", "
" ], "text/plain": [ " title authors categories \\\n", "count 2639 2639 2639 \n", "unique 2547 1827 286 \n", "top One Hundred Years of Solitude Stephen King Fiction \n", "freq 4 18 1027 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", "min NaN NaN NaN \n", "25% NaN NaN NaN \n", "50% NaN NaN NaN \n", "75% NaN NaN NaN \n", "max NaN NaN NaN \n", "\n", " published_year average_rating num_pages ratings_count \n", "count 2639.000000 2639.000000 2639.000000 2.639000e+03 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 1999.032967 3.929807 349.534672 2.363199e+04 \n", "std 9.865320 0.358919 244.871090 1.452470e+05 \n", "min 1876.000000 0.000000 0.000000 0.000000e+00 \n", "25% 1997.000000 3.770000 208.000000 1.745000e+02 \n", "50% 2002.000000 3.950000 304.000000 1.066000e+03 \n", "75% 2005.000000 4.130000 429.000000 6.084500e+03 \n", "max 2019.000000 5.000000 3020.000000 4.367341e+06 " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books_train.describe(include='all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yXkOfB9bAOq3", "outputId": "3fc9e96e-8fe0-490c-d6b5-71b21277aa0a" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
count1320132013201320.0000001320.0000001320.0000001.320000e+03
unique13031064185NaNNaNNaNNaN
top20,000 Leagues Under the SeaStephen KingFictionNaNNaNNaNNaN
freq37540NaNNaNNaNNaN
meanNaNNaNNaN1998.5909093.925470339.3469701.588767e+04
stdNaNNaNNaN10.1195690.299805219.5609647.877064e+04
minNaNNaNNaN1942.0000002.3300000.0000000.000000e+00
25%NaNNaNNaN1996.0000003.750000208.0000001.510000e+02
50%NaNNaNNaN2002.0000003.950000304.0000001.068000e+03
75%NaNNaNNaN2005.0000004.130000401.0000006.360000e+03
maxNaNNaNNaN2017.0000005.0000003342.0000002.115562e+06
\n", "
" ], "text/plain": [ " title authors categories published_year \\\n", "count 1320 1320 1320 1320.000000 \n", "unique 1303 1064 185 NaN \n", "top 20,000 Leagues Under the Sea Stephen King Fiction NaN \n", "freq 3 7 540 NaN \n", "mean NaN NaN NaN 1998.590909 \n", "std NaN NaN NaN 10.119569 \n", "min NaN NaN NaN 1942.000000 \n", "25% NaN NaN NaN 1996.000000 \n", "50% NaN NaN NaN 2002.000000 \n", "75% NaN NaN NaN 2005.000000 \n", "max NaN NaN NaN 2017.000000 \n", "\n", " average_rating num_pages ratings_count \n", "count 1320.000000 1320.000000 1.320000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 3.925470 339.346970 1.588767e+04 \n", "std 0.299805 219.560964 7.877064e+04 \n", "min 2.330000 0.000000 0.000000e+00 \n", "25% 3.750000 208.000000 1.510000e+02 \n", "50% 3.950000 304.000000 1.068000e+03 \n", "75% 4.130000 401.000000 6.360000e+03 \n", "max 5.000000 3342.000000 2.115562e+06 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books_test.describe(include='all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "CWG6q0ixAOq4", "outputId": "367a1088-975b-4da2-e333-50152a4fcbc3" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleauthorscategoriespublished_yearaverage_ratingnum_pagesratings_count
count2640264026402640.0000002640.0000002640.0000002.640000e+03
unique25621850313NaNNaNNaNNaN
topThree Complete NovelsAgatha ChristieFictionNaNNaNNaNNaN
freq614994NaNNaNNaNNaN
meanNaNNaNNaN1998.5477273.935875351.5344702.200209e+04
stdNaNNaNNaN10.4837520.316971242.8294631.558830e+05
minNaNNaNNaN1901.0000000.0000004.0000000.000000e+00
25%NaNNaNNaN1996.0000003.770000208.0000001.557500e+02
50%NaNNaNNaN2002.0000003.950000309.5000009.555000e+02
75%NaNNaNNaN2005.0000004.130000430.2500005.980750e+03
maxNaNNaNNaN2019.0000005.0000002965.0000005.629932e+06
\n", "
" ], "text/plain": [ " title authors categories published_year \\\n", "count 2640 2640 2640 2640.000000 \n", "unique 2562 1850 313 NaN \n", "top Three Complete Novels Agatha Christie Fiction NaN \n", "freq 6 14 994 NaN \n", "mean NaN NaN NaN 1998.547727 \n", "std NaN NaN NaN 10.483752 \n", "min NaN NaN NaN 1901.000000 \n", "25% NaN NaN NaN 1996.000000 \n", "50% NaN NaN NaN 2002.000000 \n", "75% NaN NaN NaN 2005.000000 \n", "max NaN NaN NaN 2019.000000 \n", "\n", " average_rating num_pages ratings_count \n", "count 2640.000000 2640.000000 2.640000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 3.935875 351.534470 2.200209e+04 \n", "std 0.316971 242.829463 1.558830e+05 \n", "min 0.000000 4.000000 0.000000e+00 \n", "25% 3.770000 208.000000 1.557500e+02 \n", "50% 3.950000 309.500000 9.555000e+02 \n", "75% 4.130000 430.250000 5.980750e+03 \n", "max 5.000000 2965.000000 5.629932e+06 " ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books_val.describe(include='all')" ] } ], "metadata": { "author": "Tomasz Ziętkiewicz", "celltoolbar": "Slideshow", "email": "tomasz.zietkiewicz@amu.edu.pl", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "slideshow": { "slide_type": "slide" }, "subtitle": "2.Dane[laboratoria]", "title": "Inżynieria uczenia maszynowego", "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": false, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": false, "toc_window_display": false }, "year": "2021", "colab": { "provenance": [] } }, "nbformat": 4, 
"nbformat_minor": 0 }