Update repo

This commit is contained in:
s464968 2023-01-19 21:22:01 +01:00
parent a31ff34818
commit 18257d0941
32 changed files with 7708249 additions and 0 deletions

8
D/.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
D/.idea/D.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
D/.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (D)" project-jdk-type="Python SDK" />
</project>

8
D/.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/D.iml" filepath="$PROJECT_DIR$/.idea/D.iml" />
</modules>
</component>
</project>

6
D/.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

Binary file not shown.

Binary file not shown.

76
D/calclex.py Normal file
View File

@ -0,0 +1,76 @@
from ply import lex
# Names of every token type this lexer can emit. PLY reads this tuple when
# building the lexer, and the parser module imports it from here.
tokens = (
    'NUMBER',    # quantity of articles
    'OPERATE',   # the requested operation keyword
    'SIZE',      # size attribute
    'KIND',      # kind of article
    'COLOR',     # colour attribute
    'MATERIAL',  # material attribute
)
# Token rule for the operation keyword. The string below is the regular
# expression PLY takes from the function's __doc__, so it must stay the first
# statement and must not be reworded. The matched text is passed on unchanged.
def t_OPERATE(t):
    r'Buy | Sell'
    token = t
    return token
# Token rule for a run of decimal digits. The string below is the regular
# expression PLY takes from __doc__ (not documentation). The matched text is
# converted so the parser receives an int rather than a str.
def t_NUMBER(t):
    r'\d+'
    digits = t.value
    t.value = int(digits)
    return t
# Token rule for a size word. The string below is the regular expression PLY
# takes from __doc__ (PLY compiles rules in verbose mode by default, so the
# spaces around `|` are not matched literally). The word is replaced by its
# numeric code 1-4; any other value is left untouched.
def t_SIZE(t):
    r'tiny | small | big | large'
    size_codes = {'tiny': 1, 'small': 2, 'big': 3, 'large': 4}
    t.value = size_codes.get(t.value, t.value)
    return t
# Token rule for a colour word. The string below is the regular expression PLY
# takes from __doc__ and must stay byte-identical. The word is replaced by its
# numeric code 1-5; any other value is left untouched.
def t_COLOR(t):
    r'(black | white | red | green | blue)'
    color_codes = {'black': 1, 'white': 2, 'red': 3, 'green': 4, 'blue': 5}
    t.value = color_codes.get(t.value, t.value)
    return t
# Token rule for a material word. The string below is the regular expression
# PLY takes from __doc__ and must stay byte-identical. 'metal' becomes 1 and
# 'plastic' becomes 2; any other value is left untouched.
def t_MATERIAL(t):
    r'metal | plastic'
    material_codes = {'metal': 1, 'plastic': 2}
    t.value = material_codes.get(t.value, t.value)
    return t
# Token rule for the kind of article, singular or plural. The string below is
# the regular expression PLY takes from __doc__ and must stay byte-identical.
# Only the first letter is inspected: text starting with 'b' becomes 1,
# anything else becomes 2.
def t_KIND(t):
    r'box(es)? | ring(s)?'
    t.value = 1 if t.value[0] == 'b' else 2
    return t
# Lexer error hook: report the first character of the unmatched input and
# advance the lexer by one character so scanning can continue.
#
# t.value holds the remaining input text; t.lexer is the lexer to advance.
def t_error(t):
    # The closing quote was missing from the original message.
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Characters PLY skips between tokens: space and tab.
t_ignore = ' \t'
# Build the lexer from the rules defined above in this module. This call must
# stay after every t_* definition, because PLY collects them at this point.
lexer = lex.lex()

147
D/parser.out Normal file
View File

@ -0,0 +1,147 @@
Created by PLY version 3.11 (http://www.dabeaz.com/ply)
Grammar
Rule 0 S' -> command
Rule 1 command -> operate NUMBER article
Rule 2 article -> attribute article
Rule 3 attribute -> COLOR
Rule 4 attribute -> MATERIAL
Rule 5 attribute -> SIZE
Rule 6 article -> KIND
Rule 7 operate -> OPERATE
Terminals, with rules where they appear
COLOR : 3
KIND : 6
MATERIAL : 4
NUMBER : 1
OPERATE : 7
SIZE : 5
error :
Nonterminals, with rules where they appear
article : 1 2
attribute : 2
command : 0
operate : 1
Parsing method: LALR
state 0
(0) S' -> . command
(1) command -> . operate NUMBER article
(7) operate -> . OPERATE
OPERATE shift and go to state 3
command shift and go to state 1
operate shift and go to state 2
state 1
(0) S' -> command .
state 2
(1) command -> operate . NUMBER article
NUMBER shift and go to state 4
state 3
(7) operate -> OPERATE .
NUMBER reduce using rule 7 (operate -> OPERATE .)
state 4
(1) command -> operate NUMBER . article
(2) article -> . attribute article
(6) article -> . KIND
(3) attribute -> . COLOR
(4) attribute -> . MATERIAL
(5) attribute -> . SIZE
KIND shift and go to state 7
COLOR shift and go to state 8
MATERIAL shift and go to state 9
SIZE shift and go to state 10
article shift and go to state 5
attribute shift and go to state 6
state 5
(1) command -> operate NUMBER article .
$end reduce using rule 1 (command -> operate NUMBER article .)
state 6
(2) article -> attribute . article
(2) article -> . attribute article
(6) article -> . KIND
(3) attribute -> . COLOR
(4) attribute -> . MATERIAL
(5) attribute -> . SIZE
KIND shift and go to state 7
COLOR shift and go to state 8
MATERIAL shift and go to state 9
SIZE shift and go to state 10
attribute shift and go to state 6
article shift and go to state 11
state 7
(6) article -> KIND .
$end reduce using rule 6 (article -> KIND .)
state 8
(3) attribute -> COLOR .
KIND reduce using rule 3 (attribute -> COLOR .)
COLOR reduce using rule 3 (attribute -> COLOR .)
MATERIAL reduce using rule 3 (attribute -> COLOR .)
SIZE reduce using rule 3 (attribute -> COLOR .)
state 9
(4) attribute -> MATERIAL .
KIND reduce using rule 4 (attribute -> MATERIAL .)
COLOR reduce using rule 4 (attribute -> MATERIAL .)
MATERIAL reduce using rule 4 (attribute -> MATERIAL .)
SIZE reduce using rule 4 (attribute -> MATERIAL .)
state 10
(5) attribute -> SIZE .
KIND reduce using rule 5 (attribute -> SIZE .)
COLOR reduce using rule 5 (attribute -> SIZE .)
MATERIAL reduce using rule 5 (attribute -> SIZE .)
SIZE reduce using rule 5 (attribute -> SIZE .)
state 11
(2) article -> attribute article .
$end reduce using rule 2 (article -> attribute article .)

37
D/parsetab.py Normal file
View File

@ -0,0 +1,37 @@
# parsetab.py
# This file is automatically generated. Do not edit.
# pylint: disable=W,C,R
_tabversion = '3.10'
_lr_method = 'LALR'
_lr_signature = 'COLOR KIND MATERIAL NUMBER OPERATE SIZEcommand : operate NUMBER articlearticle : attribute articleattribute : COLORattribute : MATERIALattribute : SIZEarticle : KINDoperate : OPERATE'
_lr_action_items = {'OPERATE':([0,],[3,]),'$end':([1,5,7,11,],[0,-1,-6,-2,]),'NUMBER':([2,3,],[4,-7,]),'KIND':([4,6,8,9,10,],[7,7,-3,-4,-5,]),'COLOR':([4,6,8,9,10,],[8,8,-3,-4,-5,]),'MATERIAL':([4,6,8,9,10,],[9,9,-3,-4,-5,]),'SIZE':([4,6,8,9,10,],[10,10,-3,-4,-5,]),}
_lr_action = {}
for _k, _v in _lr_action_items.items():
for _x,_y in zip(_v[0],_v[1]):
if not _x in _lr_action: _lr_action[_x] = {}
_lr_action[_x][_k] = _y
del _lr_action_items
_lr_goto_items = {'command':([0,],[1,]),'operate':([0,],[2,]),'article':([4,6,],[5,11,]),'attribute':([4,6,],[6,6,]),}
_lr_goto = {}
for _k, _v in _lr_goto_items.items():
for _x, _y in zip(_v[0], _v[1]):
if not _x in _lr_goto: _lr_goto[_x] = {}
_lr_goto[_x][_k] = _y
del _lr_goto_items
_lr_productions = [
("S' -> command","S'",1,None,None,None),
('command -> operate NUMBER article','command',3,'p_command','yacc.py',6),
('article -> attribute article','article',2,'p_article_attribute','yacc.py',22),
('attribute -> COLOR','attribute',1,'p_attribute_color','yacc.py',27),
('attribute -> MATERIAL','attribute',1,'p_attribute_material','yacc.py',32),
('attribute -> SIZE','attribute',1,'p_attribute_size','yacc.py',37),
('article -> KIND','article',1,'p_article_kind','yacc.py',42),
('operate -> OPERATE','operate',1,'p_operate','yacc.py',46),
]

66
D/yacc.py Normal file
View File

@ -0,0 +1,66 @@
from ply import yacc
from calclex import tokens
# Grammar action for a complete command: buy or sell a quantity of one article.
#
# p[1] is the operation text ('Buy' or 'Sell'), p[2] the quantity, and p[3]
# the numeric article code built by the article/attribute rules. The code is
# used as an index into the module-level stock table `tab`.
def p_command(p):
    'command : operate NUMBER article'
    # The string above is the grammar rule PLY reads from __doc__; it must
    # remain the first statement and must not be reworded.
    operation, quantity, index = p[1], p[2], p[3]

    # The grammar lets attributes repeat, so the summed code can fall outside
    # the stock table. Refuse politely instead of crashing with IndexError.
    if not 0 <= index < len(tab):
        print(f'I do not know any article indexed as {index}.')
        return

    if operation == 'Buy':
        tab[index] += quantity
        print(f'OK. I am buying {quantity} new articles indexed as {index}.')
        print(f'No of articles in shop: {tab[index]}')
    elif operation == 'Sell':
        if quantity > tab[index]:
            # Never let the stock count go negative.
            print('I do not have as many articles as you want.')
        else:
            tab[index] -= quantity
            print(f'OK. I am selling {quantity} articles indexed as {index}.')
            print(f'No of articles in shop: {tab[index]}')
# Grammar action for an attribute followed by the rest of the article.
# The string inside the function is the grammar rule PLY reads from __doc__.
# The article code is the attribute code plus the code of what follows.
def p_article_attribute(p):
    'article : attribute article'
    attribute_code, rest_code = p[1], p[2]
    p[0] = attribute_code + rest_code
# Grammar action for a colour attribute. The string inside the function is the
# grammar rule PLY reads from __doc__. The colour code is passed through
# unscaled.
def p_attribute_color(p):
    'attribute : COLOR'
    color_code = p[1]
    p[0] = color_code
# Grammar action for a material attribute. The string inside the function is
# the grammar rule PLY reads from __doc__. The material code is multiplied by
# ten to form its contribution to the article code.
def p_attribute_material(p):
    'attribute : MATERIAL'
    material_code = p[1]
    p[0] = 10 * material_code
# Grammar action for a size attribute. The string inside the function is the
# grammar rule PLY reads from __doc__. The size code is multiplied by one
# hundred to form its contribution to the article code.
def p_attribute_size(p):
    'attribute : SIZE'
    size_code = p[1]
    p[0] = 100 * size_code
# Grammar action for the kind that ends an article phrase. The string inside
# the function is the grammar rule PLY reads from __doc__. The kind code is
# multiplied by one thousand to form its contribution to the article code.
def p_article_kind(p):
    'article : KIND'
    kind_code = p[1]
    p[0] = 1000 * kind_code
# Grammar action wrapping the OPERATE token. The string inside the function is
# the grammar rule PLY reads from __doc__. The token's value is forwarded
# unchanged as the value of `operate`.
def p_operate(p):
    'operate : OPERATE'
    operation = p[1]
    p[0] = operation
# Parser error hook: print a fixed message. The argument is not inspected.
def p_error(p):
    message = "Syntax error in input!"
    print(message)
# Stock table: one counter per article code, all starting at zero.
# Codes 0..2999 are addressable.
tab = [0] * 3000
# Build the parser from the p_* rules defined above in this module. This call
# must stay after those definitions.
parser = yacc.yacc()

# Interactive loop: read one command per line and parse it until the user
# types 'Bye'.
while True:
    try:
        s = input('What can I do for you? \n')
    except EOFError:
        # stdin was closed (Ctrl-D or end of piped input). Leave the loop
        # quietly instead of ending with a traceback.
        break
    if s == 'Bye':
        break
    parser.parse(s)

8
E/.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
E/.idea/E.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
E/.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>

8
E/.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/E.iml" filepath="$PROJECT_DIR$/.idea/E.iml" />
</modules>
</component>
</project>

6
E/.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

BIN
E/E.zip Normal file

Binary file not shown.

6578142
E/PoliMorf-0.6.7.tab Normal file

File diff suppressed because it is too large Load Diff

50
E/bpe.py Normal file
View File

@ -0,0 +1,50 @@
import string
def bpe(sentence: str, V: int) -> set:
    """Build a byte-pair-encoding style vocabulary for *sentence*.

    Punctuation is removed and the text is lower-cased. The sentence is split
    into single characters, with every space replaced by the marker "<w>" and
    one extra "<w>" added at each end. The most frequent adjacent pair is then
    merged repeatedly, and each merged string is added to the vocabulary.

    Args:
        sentence: Text to learn the vocabulary from.
        V: Target vocabulary size.

    Returns:
        The set of vocabulary strings. It may hold fewer than *V* entries when
        the sentence is too short to produce that many merges.
    """
    # Remove punctuation and normalise case.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).lower()
    characters = ["<w>" if ch == " " else ch for ch in sentence]
    characters.append("<w>")
    characters.insert(0, "<w>")
    vocabulary = set(characters)
    while len(vocabulary) < V:
        bigrams = get_bigrams(characters)
        if not bigrams:
            # Everything has been merged into one token. The original code
            # called max() on an empty dict here and raised ValueError.
            break
        frequencies = get_frequencies(bigrams)
        most_freq_bigr = max(frequencies, key=frequencies.get)
        upd_sentence_with_bigram(characters, most_freq_bigr)
        vocabulary.add(most_freq_bigr)
    return vocabulary
def get_bigrams(characters: list) -> list:
    """Return the concatenation of every adjacent pair in *characters*.

    The result keeps the input order and has one element fewer than the input.
    It is empty when the input has fewer than two elements.
    """
    return [left + right for left, right in zip(characters, characters[1:])]
def get_frequencies(item: list) -> dict:
    """Count how many times each element of *item* occurs.

    Returns a dict mapping each element to its count. Keys appear in order of
    first occurrence.
    """
    counts = {}
    for element in item:
        counts[element] = counts.get(element, 0) + 1
    return counts
def upd_sentence_with_bigram(chars: list, bigram: str) -> list:
    """Merge, in place, adjacent elements whose concatenation equals *bigram*.

    The list is scanned left to right. A freshly merged element is not
    compared again with its right-hand neighbour. The same list object is
    returned for convenience.
    """
    position = 0
    while position + 1 < len(chars):
        left, right = chars[position], chars[position + 1]
        if left + right == bigram:
            # Replace the two elements with the single merged one.
            chars[position:position + 2] = [bigram]
        position += 1
    return chars
# Script entry: ask for a sentence and a vocabulary size, then print each
# learned vocabulary entry on its own line.
usr_input = input('Podaj zdanie: ')
V = int(input('Podaj wielkosc slownika: '))
vocab = bpe(usr_input, V)
for i in vocab:
    print(i)

22
E/maxmatch.py Normal file
View File

@ -0,0 +1,22 @@
import pandas as pd
def max_match(sentence: str, dictionary: list) -> list:
    """Segment *sentence* greedily into the longest known words (MaxMatch).

    Starting at the left, the longest prefix of the remaining text that is in
    *dictionary* is taken as the next word. When no prefix is known, a single
    character is emitted instead.

    Args:
        sentence: Text without word separators.
        dictionary: Collection of known words.

    Returns:
        The list of segments in order; an empty list for an empty sentence.
    """
    # Build the lookup set once. Scanning a large word list for every
    # candidate prefix would be linear per probe.
    if isinstance(dictionary, (set, frozenset)):
        known_words = dictionary
    else:
        known_words = set(dictionary)

    words = []
    start = 0
    total = len(sentence)
    # Iterative rather than recursive, so long inputs cannot hit the
    # recursion limit.
    while start < total:
        # Try the longest candidate first, including the whole remainder. The
        # original started one character short and also tested the empty
        # prefix.
        for end in range(total, start, -1):
            if sentence[start:end] in known_words:
                break
        else:
            # No known word starts here: fall back to a single character.
            end = start + 1
        words.append(sentence[start:end])
        start = end
    return words
# Script entry: load the tab-separated PoliMorf file (no header row), use its
# first column as the word list, and segment one sample string.
pm = pd.read_csv(
    "PoliMorf-0.6.7.tab",
    delimiter="\t",
    header=None,
)
polish_dict = list(pm[0])
result = max_match("Alamakota", polish_dict)
print(result)

26
G/G.py Normal file
View File

@ -0,0 +1,26 @@
import spacy
class Imp:
    # One imperative found in the input: the verb text and the words that
    # follow it. Both default to the empty string at class level; assigning
    # through an instance creates a per-instance value.
    action_s = object_s = ''
# Script entry: read one sentence, print each token with its part of speech,
# then split the sentence into imperative verbs and the words following each.
nlp = spacy.load("pl_core_news_sm")
inputed_text = input('Podaj zdanie: ')
doc = nlp(inputed_text)
for token in doc:
    print(token.text, token.pos_)

# Continue only when at least one token is marked with imperative mood.
contains_imp = any(token.morph.get('Mood') == ['Imp'] for token in doc)
if not contains_imp:
    print('Bye!')
else:
    imp_list = []
    for token in doc:
        if token.tag_ == 'IMPT':
            # A new imperative starts a new action/object pair.
            command = Imp()
            command.action_s = token.text
            imp_list.append(command)
        elif imp_list:
            # Attach the word to the most recent imperative.
            imp_list[-1].object_s += token.text + " "
        # Tokens before the first imperative have nothing to attach to and
        # are skipped. The original indexed an empty list here and raised
        # IndexError.
    for imp in imp_list:
        print('Action: '+imp.action_s+", Object: "+imp.object_s)

375
P1/Movielens/main.ipynb Normal file
View File

@ -0,0 +1,375 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MovieID</th>\n",
" <th>MovieName</th>\n",
" <th>Category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children's|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MovieID MovieName Category\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy\n",
"1 2 Jumanji (1995) Adventure|Children's|Fantasy\n",
"2 3 Grumpier Old Men (1995) Comedy|Romance\n",
"3 4 Waiting to Exhale (1995) Comedy|Drama\n",
"4 5 Father of the Bride Part II (1995) Comedy"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movie = pd.read_csv('movies.dat',sep='::',encoding='latin1',engine='python',names=['MovieID','MovieName','Category'])\n",
"df_movie.dropna(inplace=True)\n",
"df_movie.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>MovieID</th>\n",
" <th>Ratings</th>\n",
" <th>TimeStamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1193</td>\n",
" <td>5</td>\n",
" <td>978300760</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>661</td>\n",
" <td>3</td>\n",
" <td>978302109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>914</td>\n",
" <td>3</td>\n",
" <td>978301968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3408</td>\n",
" <td>4</td>\n",
" <td>978300275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2355</td>\n",
" <td>5</td>\n",
" <td>978824291</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID MovieID Ratings TimeStamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rating = pd.read_csv('ratings.dat',sep='::',encoding='latin1',engine='python',names=['ID','MovieID','Ratings','TimeStamp'])\n",
"df_rating.dropna(inplace=True)\n",
"df_rating.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MovieID</th>\n",
" <th>MovieName</th>\n",
" <th>Category</th>\n",
" <th>ID</th>\n",
" <th>Ratings</th>\n",
" <th>TimeStamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>978824268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>978237008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>978233496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>978225952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>10</td>\n",
" <td>5</td>\n",
" <td>978226474</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MovieID MovieName Category ID Ratings \\\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy 1 5 \n",
"1 1 Toy Story (1995) Animation|Children's|Comedy 6 4 \n",
"2 1 Toy Story (1995) Animation|Children's|Comedy 8 4 \n",
"3 1 Toy Story (1995) Animation|Children's|Comedy 9 5 \n",
"4 1 Toy Story (1995) Animation|Children's|Comedy 10 5 \n",
"\n",
" TimeStamp \n",
"0 978824268 \n",
"1 978237008 \n",
"2 978233496 \n",
"3 978225952 \n",
"4 978226474 "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.merge(df_movie,df_rating,left_on='MovieID',right_on='MovieID')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MovieID\n",
"1 8.293693\n",
"2 6.402282\n",
"3 6.033473\n",
"4 5.458824\n",
"5 6.013514\n",
" ... \n",
"3948 7.271462\n",
"3949 8.230263\n",
"3950 7.333333\n",
"3951 7.800000\n",
"3952 7.561856\n",
"Name: Ratings, Length: 3706, dtype: float64"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"groupByMovie = df.groupby('MovieID')\n",
"movieRatingsMean = groupByMovie['Ratings'].mean()*2\n",
"movieRatingsMean.columns = ['MovieID','Mean']\n",
"movieRatingsMean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

3883
P1/Movielens/movies.dat Normal file

File diff suppressed because it is too large Load Diff

1000209
P1/Movielens/ratings.dat Normal file

File diff suppressed because it is too large Load Diff

6040
P1/Movielens/users.dat Normal file

File diff suppressed because it is too large Load Diff

484
P1/imdb/main.ipynb Normal file
View File

@ -0,0 +1,484 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>ONE-LINE</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>RunTime</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>\\nAction, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>\\nA woman with a mysterious illness is forced ...</td>\n",
" <td>\\n Director:\\nPeter Thorwarth\\n| \\n Star...</td>\n",
" <td>21,062</td>\n",
" <td>121.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Masters of the Universe: Revelation</td>\n",
" <td>(2021 )</td>\n",
" <td>\\nAnimation, Action, Adventure</td>\n",
" <td>5.0</td>\n",
" <td>\\nThe war for Eternia begins again in what may...</td>\n",
" <td>\\n \\n Stars:\\nChris Wood, \\nSara...</td>\n",
" <td>17,870</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Walking Dead</td>\n",
" <td>(20102022)</td>\n",
" <td>\\nDrama, Horror, Thriller</td>\n",
" <td>8.2</td>\n",
" <td>\\nSheriff Deputy Rick Grimes wakes up from a c...</td>\n",
" <td>\\n \\n Stars:\\nAndrew Lincoln, \\n...</td>\n",
" <td>885,805</td>\n",
" <td>44.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rick and Morty</td>\n",
" <td>(2013 )</td>\n",
" <td>\\nAnimation, Adventure, Comedy</td>\n",
" <td>9.2</td>\n",
" <td>\\nAn animated series that follows the exploits...</td>\n",
" <td>\\n \\n Stars:\\nJustin Roiland, \\n...</td>\n",
" <td>414,849</td>\n",
" <td>23.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Army of Thieves</td>\n",
" <td>(2021)</td>\n",
" <td>\\nAction, Crime, Horror</td>\n",
" <td>NaN</td>\n",
" <td>\\nA prequel, set before the events of Army of ...</td>\n",
" <td>\\n Director:\\nMatthias Schweighöfer\\n| \\n ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR \\\n",
"0 Blood Red Sky (2021) \n",
"1 Masters of the Universe: Revelation (2021 ) \n",
"2 The Walking Dead (20102022) \n",
"3 Rick and Morty (2013 ) \n",
"4 Army of Thieves (2021) \n",
"\n",
" GENRE RATING \\\n",
"0 \\nAction, Horror, Thriller 6.1 \n",
"1 \\nAnimation, Action, Adventure 5.0 \n",
"2 \\nDrama, Horror, Thriller 8.2 \n",
"3 \\nAnimation, Adventure, Comedy 9.2 \n",
"4 \\nAction, Crime, Horror NaN \n",
"\n",
" ONE-LINE \\\n",
"0 \\nA woman with a mysterious illness is forced ... \n",
"1 \\nThe war for Eternia begins again in what may... \n",
"2 \\nSheriff Deputy Rick Grimes wakes up from a c... \n",
"3 \\nAn animated series that follows the exploits... \n",
"4 \\nA prequel, set before the events of Army of ... \n",
"\n",
" STARS VOTES RunTime Gross \n",
"0 \\n Director:\\nPeter Thorwarth\\n| \\n Star... 21,062 121.0 NaN \n",
"1 \\n \\n Stars:\\nChris Wood, \\nSara... 17,870 25.0 NaN \n",
"2 \\n \\n Stars:\\nAndrew Lincoln, \\n... 885,805 44.0 NaN \n",
"3 \\n \\n Stars:\\nJustin Roiland, \\n... 414,849 23.0 NaN \n",
"4 \\n Director:\\nMatthias Schweighöfer\\n| \\n ... NaN NaN NaN "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('movies.csv',header=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>ONE-LINE</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>RunTime</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>A woman with a mysterious illness is forced in...</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21,062</td>\n",
" <td>121.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Masters of the Universe: Revelation</td>\n",
" <td>(2021 )</td>\n",
" <td>Animation, Action, Adventure</td>\n",
" <td>5.0</td>\n",
" <td>The war for Eternia begins again in what may b...</td>\n",
" <td>Stars:Chris Wood, Sarah Michelle Gellar, Lena ...</td>\n",
" <td>17,870</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Walking Dead</td>\n",
" <td>(20102022)</td>\n",
" <td>Drama, Horror, Thriller</td>\n",
" <td>8.2</td>\n",
" <td>Sheriff Deputy Rick Grimes wakes up from a com...</td>\n",
" <td>Stars:Andrew Lincoln, Norman Reedus, Melissa M...</td>\n",
" <td>885,805</td>\n",
" <td>44.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rick and Morty</td>\n",
" <td>(2013 )</td>\n",
" <td>Animation, Adventure, Comedy</td>\n",
" <td>9.2</td>\n",
" <td>An animated series that follows the exploits o...</td>\n",
" <td>Stars:Justin Roiland, Chris Parnell, Spencer G...</td>\n",
" <td>414,849</td>\n",
" <td>23.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Army of Thieves</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Crime, Horror</td>\n",
" <td>NaN</td>\n",
" <td>A prequel, set before the events of Army of th...</td>\n",
" <td>Director:Matthias Schweighöfer| Stars:Matt...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR \\\n",
"0 Blood Red Sky (2021) \n",
"1 Masters of the Universe: Revelation (2021 ) \n",
"2 The Walking Dead (20102022) \n",
"3 Rick and Morty (2013 ) \n",
"4 Army of Thieves (2021) \n",
"\n",
" GENRE RATING \\\n",
"0 Action, Horror, Thriller 6.1 \n",
"1 Animation, Action, Adventure 5.0 \n",
"2 Drama, Horror, Thriller 8.2 \n",
"3 Animation, Adventure, Comedy 9.2 \n",
"4 Action, Crime, Horror NaN \n",
"\n",
" ONE-LINE \\\n",
"0 A woman with a mysterious illness is forced in... \n",
"1 The war for Eternia begins again in what may b... \n",
"2 Sheriff Deputy Rick Grimes wakes up from a com... \n",
"3 An animated series that follows the exploits o... \n",
"4 A prequel, set before the events of Army of th... \n",
"\n",
" STARS VOTES RunTime Gross \n",
"0 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 121.0 NaN \n",
"1 Stars:Chris Wood, Sarah Michelle Gellar, Lena ... 17,870 25.0 NaN \n",
"2 Stars:Andrew Lincoln, Norman Reedus, Melissa M... 885,805 44.0 NaN \n",
"3 Stars:Justin Roiland, Chris Parnell, Spencer G... 414,849 23.0 NaN \n",
"4 Director:Matthias Schweighöfer| Stars:Matt... NaN NaN NaN "
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['GENRE'] = df['GENRE'].str.replace('\\n','')\n",
"df['ONE-LINE'] = df['ONE-LINE'].str.replace('\\n','')\n",
"df['STARS'] = df['STARS'].str.replace('\\n','')\n",
"\n",
"df['GENRE'] = df['GENRE'].str.strip()\n",
"df['ONE-LINE'] = df['ONE-LINE'].str.strip()\n",
"df['STARS'] = df['STARS'].str.strip()\n",
"\n",
"# df['YEAR'] = df['YEAR'].str.strip('()')\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Peri Baumeister, Carl Anton Koch, Alexander Sc...\n",
"1 Chris Wood, Sarah Michelle Gellar, Lena Headey...\n",
"2 Andrew Lincoln, Norman Reedus, Melissa McBride...\n",
"3 Justin Roiland, Chris Parnell, Spencer Grammer...\n",
"4 Matthias Schweighöfer, Nathalie Emmanuel, Ruby...\n",
" ... \n",
"9993 Felix Klare, Romina Küper, Anna Maria Mühe, Ro...\n",
"9994 Morgan Taylor Campbell, Chris Cope, Iñaki Godo...\n",
"9996 Prince Harry\n",
"9997 Morgan Taylor Campbell, Iñaki Godoy, Rhianna J...\n",
"9998 Morgan Taylor Campbell, Jennifer Cheon Garcia,...\n",
"Name: Directors, Length: 9206, dtype: object"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"def extract_director(direct):\n",
" result = re.search(r'(Director:|Directors:)(.*)\\|',direct)\n",
" if result:\n",
" return result.group(2).strip()\n",
"\n",
"def extract_stars(stars):\n",
" result = re.search(r'(Stars:|Star:)(.*)',stars)\n",
" if result:\n",
" return result.group(2).strip()\n",
"\n",
"df['Stars'] = df['STARS'].apply(lambda d : extract_director(d))\n",
"df['Directors'] = df['STARS'].apply(lambda s : extract_stars(s))\n",
"df['Stars'].dropna()\n",
"df['Directors'].dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Year</th>\n",
" <th>Count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020</td>\n",
" <td>898</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020</td>\n",
" <td>742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021</td>\n",
" <td>661</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019</td>\n",
" <td>657</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2019</td>\n",
" <td>553</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Year Count\n",
"0 2020 898\n",
"1 2020 742\n",
"2 2021 661\n",
"3 2019 657\n",
"4 2019 553"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Year'] = df['YEAR'].str.extract(r'([0-9]{4}.*|[0-9]{4})')\n",
"df['Year'] = df['Year'].str.strip().replace(\")\",\"\")\n",
"\n",
"def extract_year(year):\n",
" if year[-3:] == ' )':\n",
" return year.replace(' )',\"\")\n",
" else:\n",
" return year.replace(')',\"\")\n",
"\n",
"df['Year'] = df['Year'].fillna('Unknown')\n",
"df['Year'] = df['Year'].apply(lambda y: extract_year(y))\n",
" \n",
"year_count = df[df['Year'] != 'Unknown']['Year'].value_counts().reset_index().rename(columns = {'Year':'Count','index':'Year'})\n",
"year_count.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

108876
P1/imdb/movies.csv Normal file

File diff suppressed because it is too large Load Diff

128
P1/tmdb/main.ipynb Normal file
View File

@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"movies = pd.read_csv('tmdb_5000_movies.csv')\n",
"credits = pd.read_csv('tmdb_5000_credits.csv')\n",
"df = pd.merge(movies,credits,left_on=['id','title'],right_on=['movie_id','title'])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"def load_json_columns(col):\n",
" col = col.apply(json.loads)\n",
"load_json_columns(df['genres'])\n",
"# df['genres'] = df['genres'].apply(json.loads)\n",
"# df['keywords'] = df['keywords'].apply(json.loads)\n",
"# df['production_companies'] = df['production_companies'].apply(json.loads)\n",
"# df['production_countries'] = df['production_countries'].apply(json.loads)\n",
"# df['cast'] = df['cast'].apply(json.loads)\n",
"# df['crew'] = df['crew'].apply(json.loads)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'id': 1463, 'name': 'culture clash'},\n",
" {'id': 2964, 'name': 'future'},\n",
" {'id': 3386, 'name': 'space war'},\n",
" {'id': 3388, 'name': 'space colony'},\n",
" {'id': 3679, 'name': 'society'},\n",
" {'id': 3801, 'name': 'space travel'},\n",
" {'id': 9685, 'name': 'futuristic'},\n",
" {'id': 9840, 'name': 'romance'},\n",
" {'id': 9882, 'name': 'space'},\n",
" {'id': 9951, 'name': 'alien'},\n",
" {'id': 10148, 'name': 'tribe'},\n",
" {'id': 10158, 'name': 'alien planet'},\n",
" {'id': 10987, 'name': 'cgi'},\n",
" {'id': 11399, 'name': 'marine'},\n",
" {'id': 13065, 'name': 'soldier'},\n",
" {'id': 14643, 'name': 'battle'},\n",
" {'id': 14720, 'name': 'love affair'},\n",
" {'id': 165431, 'name': 'anti war'},\n",
" {'id': 193554, 'name': 'power relations'},\n",
" {'id': 206690, 'name': 'mind and soul'},\n",
" {'id': 209714, 'name': '3d'}]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['keywords'][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

4804
P1/tmdb/tmdb_5000_movies.csv Normal file

File diff suppressed because one or more lines are too long