Add project files
This commit is contained in:
commit
9d06c4e3e6
BIN
Dokument_Netflix_Galla.docx
Normal file
BIN
Dokument_Netflix_Galla.docx
Normal file
Binary file not shown.
190
UM_projekt.ipynb
Normal file
190
UM_projekt.ipynb
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.1"
|
||||||
|
},
|
||||||
|
"colab": {
|
||||||
|
"name": "UM_projekt.ipynb",
|
||||||
|
"provenance": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "CyNKlKu4mVOD",
|
||||||
|
"outputId": "b3f1fe30-a641-4d4e-872d-833739924713"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import datetime\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from sklearn.metrics import mean_squared_error\n",
|
||||||
|
"\n",
|
||||||
|
"netflix=pd.read_csv('netflix_titles_enriched.csv')\n",
|
||||||
|
"\n",
|
||||||
|
"netflix_cleaned = netflix[netflix.rottentomatoes_audience_score > 0].sort_values(by = 'rottentomatoes_audience_score')\n",
|
||||||
|
"netflix_cleaned.rottentomatoes_audience_score /= 100\n",
|
||||||
|
"netflix_cleaned.drop(['rottentomatoes_audience_#reviews',\n",
|
||||||
|
" 'rottentomatoes_audience_review', 'rottentomatoes_tomatometer_score',\n",
|
||||||
|
" 'rottentomatoes_critics_#reviews', 'rottentomatoes_critic_review'], axis = 1)\n",
|
||||||
|
"\n",
|
||||||
|
"netflix_cleaned.date_added = netflix_cleaned.date_added.dropna().apply(lambda x: datetime.datetime.strptime(x[1:] if x[0] == ' ' else x, '%B %d, %Y'))\n",
|
||||||
|
"\n",
|
||||||
|
"netflix_cleaned.update(netflix_cleaned.select_dtypes(include = 'object').apply(lambda col: col.str.lower()))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"movies = netflix_cleaned[netflix_cleaned.type == 'movie']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"movies.duration = movies.duration.str.extract(r'(\\d*)( min)')[0].astype('int32')\n",
|
||||||
|
"\n",
|
||||||
|
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
||||||
|
"mlb = MultiLabelBinarizer()\n",
|
||||||
|
"\n",
|
||||||
|
"movies = movies.join(pd.DataFrame(mlb.fit_transform(movies.pop('listed_in').str.split(', ')),\n",
|
||||||
|
" columns=mlb.classes_,\n",
|
||||||
|
" index=movies.index))\n",
|
||||||
|
"movies.drop(['movies'], axis = 1)\n",
|
||||||
|
"\n",
|
||||||
|
"movies = movies[['release_year', 'duration',\n",
|
||||||
|
" 'rottentomatoes_audience_score',\n",
|
||||||
|
" 'action & adventure', 'anime features', 'children & family movies',\n",
|
||||||
|
" 'classic movies', 'comedies', 'cult movies', 'documentaries', 'dramas',\n",
|
||||||
|
" 'faith & spirituality', 'horror movies', 'independent movies',\n",
|
||||||
|
" 'international movies', 'lgbtq movies', 'movies', 'music & musicals',\n",
|
||||||
|
" 'romantic movies', 'sci-fi & fantasy', 'sports movies',\n",
|
||||||
|
" 'stand-up comedy', 'thrillers']]\n",
|
||||||
|
"\n",
|
||||||
|
"import sklearn\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"movies_train, movies_test = sklearn.model_selection.train_test_split(movies,test_size=0.20, random_state=42)\n",
|
||||||
|
"#movies_test, movies_val = sklearn.model_selection.train_test_split(movies_test,test_size=0.50, random_state=42)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = movies_train.copy()\n",
|
||||||
|
"y_train = x_train.pop('rottentomatoes_audience_score')\n",
|
||||||
|
"#x_train.pop('Unnamed: 0')\n",
|
||||||
|
"\n",
|
||||||
|
"x_test = movies_test.copy()\n",
|
||||||
|
"y_test = x_test.pop('rottentomatoes_audience_score')\n",
|
||||||
|
"#y_test.pop('Unnamed: 0')\n"
|
||||||
|
],
|
||||||
|
"execution_count": 60,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2700\n",
|
||||||
|
"2160 540\n"
|
||||||
|
],
|
||||||
|
"name": "stdout"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:5170: SettingWithCopyWarning: \n",
|
||||||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||||
|
"\n",
|
||||||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
" self[name] = value\n"
|
||||||
|
],
|
||||||
|
"name": "stderr"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "MqmpAk48n-LR"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"from sklearn.linear_model import SGDRegressor\n",
|
||||||
|
"from sklearn.pipeline import make_pipeline\n",
|
||||||
|
"from sklearn.preprocessing import StandardScaler\n",
|
||||||
|
"modelSGD = make_pipeline(StandardScaler(), SGDRegressor())\n",
|
||||||
|
"modelSGD.fit(x_train, y_train) \n",
|
||||||
|
"y_predicted = modelSGD.predict(x_val)\n",
|
||||||
|
"errorSGD = mean_squared_error(y_val, y_predicted)"
|
||||||
|
],
|
||||||
|
"execution_count": 61,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "WLzrNTM7lxnE"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"from sklearn.linear_model import LinearRegression\n",
|
||||||
|
"modelLR = make_pipeline(StandardScaler(), LinearRegression())\n",
|
||||||
|
"modelLR.fit(x_train, y_train) \n",
|
||||||
|
"y_predicted = modelLR.predict(x_val)\n",
|
||||||
|
"errorLR = mean_squared_error(y_val, y_predicted)"
|
||||||
|
],
|
||||||
|
"execution_count": 62,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "x4C2ExxamxU5"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"from sklearn.svm import SVR\n",
|
||||||
|
"modelSVR = make_pipeline(StandardScaler(), SVR())\n",
|
||||||
|
"modelSVR.fit(x_train, y_train) \n",
|
||||||
|
"y_predicted = modelSVR.predict(x_val)\n",
|
||||||
|
"errorSVR = mean_squared_error(y_val, y_predicted)"
|
||||||
|
],
|
||||||
|
"execution_count": 63,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "TNj0zacwroWJ",
|
||||||
|
"outputId": "12378b41-418d-464f-a77f-8455af754953"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"print(errorLR, errorSGD, errorSVR)"
|
||||||
|
],
|
||||||
|
"execution_count": 64,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.040133803263361176 0.040076924018007165 0.04124993242855958\n"
|
||||||
|
],
|
||||||
|
"name": "stdout"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
9660
netflix_titles_enriched.csv
Normal file
9660
netflix_titles_enriched.csv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user