-Added scraper for interests -Added scraper for movie links for interests -Generated the complete data for interests (genre and subgenre names and links) and an example of movie links for the Action subgenre -Added config.py for all scraping purposes -Modified .gitignore to ignore __pycache__ folders

This commit is contained in:
jakzar 2024-12-23 00:19:02 +01:00
parent 0bed2c8765
commit a0ce4e175f
6 changed files with 1029 additions and 1 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
.venv
.venv
**/__pycache__/

894
data/interest.json Normal file
View File

@ -0,0 +1,894 @@
{
"Action": [
[
"Action",
"https://www.imdb.com/interest/in0000001/?ref_=ints_cat_1_in_t_1"
],
[
"Action Epic",
"https://www.imdb.com/interest/in0000002/?ref_=ints_cat_1_in_t_2"
],
[
"B-Action",
"https://www.imdb.com/interest/in0000003/?ref_=ints_cat_1_in_t_3"
],
[
"Car Action",
"https://www.imdb.com/interest/in0000004/?ref_=ints_cat_1_in_t_4"
],
[
"Disaster",
"https://www.imdb.com/interest/in0000005/?ref_=ints_cat_1_in_t_5"
],
[
"Gun Fu",
"https://www.imdb.com/interest/in0000197/?ref_=ints_cat_1_in_t_6"
],
[
"Kung Fu",
"https://www.imdb.com/interest/in0000198/?ref_=ints_cat_1_in_t_7"
],
[
"Martial Arts",
"https://www.imdb.com/interest/in0000006/?ref_=ints_cat_1_in_t_8"
],
[
"One-Person Army Action",
"https://www.imdb.com/interest/in0000007/?ref_=ints_cat_1_in_t_9"
],
[
"Samurai",
"https://www.imdb.com/interest/in0000199/?ref_=ints_cat_1_in_t_10"
],
[
"Superhero",
"https://www.imdb.com/interest/in0000008/?ref_=ints_cat_1_in_t_11"
],
[
"Sword & Sandal",
"https://www.imdb.com/interest/in0000009/?ref_=ints_cat_1_in_t_12"
],
[
"War",
"https://www.imdb.com/interest/in0000010/?ref_=ints_cat_1_in_t_13"
],
[
"War Epic",
"https://www.imdb.com/interest/in0000011/?ref_=ints_cat_1_in_t_14"
],
[
"Wuxia",
"https://www.imdb.com/interest/in0000200/?ref_=ints_cat_1_in_t_15"
]
],
"Adventure": [
[
"Adventure",
"https://www.imdb.com/interest/in0000012/?ref_=ints_cat_2_in_t_1"
],
[
"Adventure Epic",
"https://www.imdb.com/interest/in0000015/?ref_=ints_cat_2_in_t_2"
],
[
"Desert Adventure",
"https://www.imdb.com/interest/in0000013/?ref_=ints_cat_2_in_t_3"
],
[
"Dinosaur Adventure",
"https://www.imdb.com/interest/in0000014/?ref_=ints_cat_2_in_t_4"
],
[
"Globetrotting Adventure",
"https://www.imdb.com/interest/in0000016/?ref_=ints_cat_2_in_t_5"
],
[
"Jungle Adventure",
"https://www.imdb.com/interest/in0000017/?ref_=ints_cat_2_in_t_6"
],
[
"Mountain Adventure",
"https://www.imdb.com/interest/in0000018/?ref_=ints_cat_2_in_t_7"
],
[
"Quest",
"https://www.imdb.com/interest/in0000019/?ref_=ints_cat_2_in_t_8"
],
[
"Road Trip",
"https://www.imdb.com/interest/in0000020/?ref_=ints_cat_2_in_t_9"
],
[
"Sea Adventure",
"https://www.imdb.com/interest/in0000021/?ref_=ints_cat_2_in_t_10"
],
[
"Swashbuckler",
"https://www.imdb.com/interest/in0000022/?ref_=ints_cat_2_in_t_11"
],
[
"Teen Adventure",
"https://www.imdb.com/interest/in0000023/?ref_=ints_cat_2_in_t_12"
],
[
"Urban Adventure",
"https://www.imdb.com/interest/in0000024/?ref_=ints_cat_2_in_t_13"
]
],
"Animation": [
[
"Adult Animation",
"https://www.imdb.com/interest/in0000025/?ref_=ints_cat_3_in_t_1"
],
[
"Animation",
"https://www.imdb.com/interest/in0000026/?ref_=ints_cat_3_in_t_2"
],
[
"Computer Animation",
"https://www.imdb.com/interest/in0000028/?ref_=ints_cat_3_in_t_3"
],
[
"Hand-Drawn Animation",
"https://www.imdb.com/interest/in0000029/?ref_=ints_cat_3_in_t_4"
],
[
"Stop Motion Animation",
"https://www.imdb.com/interest/in0000030/?ref_=ints_cat_3_in_t_5"
]
],
"Anime": [
[
"Anime",
"https://www.imdb.com/interest/in0000027/?ref_=ints_cat_4_in_t_1"
],
[
"Isekai",
"https://www.imdb.com/interest/in0000201/?ref_=ints_cat_4_in_t_2"
],
[
"Iyashikei",
"https://www.imdb.com/interest/in0000202/?ref_=ints_cat_4_in_t_3"
],
[
"Josei",
"https://www.imdb.com/interest/in0000203/?ref_=ints_cat_4_in_t_4"
],
[
"Mecha",
"https://www.imdb.com/interest/in0000204/?ref_=ints_cat_4_in_t_5"
],
[
"Seinen",
"https://www.imdb.com/interest/in0000205/?ref_=ints_cat_4_in_t_6"
],
[
"Sh\u014djo",
"https://www.imdb.com/interest/in0000207/?ref_=ints_cat_4_in_t_7"
],
[
"Sh\u014dnen",
"https://www.imdb.com/interest/in0000206/?ref_=ints_cat_4_in_t_8"
],
[
"Slice of Life",
"https://www.imdb.com/interest/in0000208/?ref_=ints_cat_4_in_t_9"
]
],
"Comedy": [
[
"Body Swap Comedy",
"https://www.imdb.com/interest/in0000031/?ref_=ints_cat_5_in_t_1"
],
[
"Buddy Comedy",
"https://www.imdb.com/interest/in0000032/?ref_=ints_cat_5_in_t_2"
],
[
"Buddy Cop",
"https://www.imdb.com/interest/in0000033/?ref_=ints_cat_5_in_t_3"
],
[
"Comedy",
"https://www.imdb.com/interest/in0000034/?ref_=ints_cat_5_in_t_4"
],
[
"Dark Comedy",
"https://www.imdb.com/interest/in0000035/?ref_=ints_cat_5_in_t_5"
],
[
"Farce",
"https://www.imdb.com/interest/in0000036/?ref_=ints_cat_5_in_t_6"
],
[
"High-Concept Comedy",
"https://www.imdb.com/interest/in0000037/?ref_=ints_cat_5_in_t_7"
],
[
"Mockumentary",
"https://www.imdb.com/interest/in0000038/?ref_=ints_cat_5_in_t_8"
],
[
"Parody",
"https://www.imdb.com/interest/in0000039/?ref_=ints_cat_5_in_t_9"
],
[
"Quirky Comedy",
"https://www.imdb.com/interest/in0000040/?ref_=ints_cat_5_in_t_10"
],
[
"Raunchy Comedy",
"https://www.imdb.com/interest/in0000041/?ref_=ints_cat_5_in_t_11"
],
[
"Satire",
"https://www.imdb.com/interest/in0000042/?ref_=ints_cat_5_in_t_12"
],
[
"Screwball Comedy",
"https://www.imdb.com/interest/in0000043/?ref_=ints_cat_5_in_t_13"
],
[
"Sitcom",
"https://www.imdb.com/interest/in0000044/?ref_=ints_cat_5_in_t_14"
],
[
"Sketch Comedy",
"https://www.imdb.com/interest/in0000045/?ref_=ints_cat_5_in_t_15"
],
[
"Slapstick",
"https://www.imdb.com/interest/in0000046/?ref_=ints_cat_5_in_t_16"
],
[
"Stand-Up",
"https://www.imdb.com/interest/in0000047/?ref_=ints_cat_5_in_t_17"
],
[
"Stoner Comedy",
"https://www.imdb.com/interest/in0000048/?ref_=ints_cat_5_in_t_18"
],
[
"Teen Comedy",
"https://www.imdb.com/interest/in0000049/?ref_=ints_cat_5_in_t_19"
]
],
"Crime": [
[
"Caper",
"https://www.imdb.com/interest/in0000050/?ref_=ints_cat_6_in_t_1"
],
[
"Cop Drama",
"https://www.imdb.com/interest/in0000051/?ref_=ints_cat_6_in_t_2"
],
[
"Crime",
"https://www.imdb.com/interest/in0000052/?ref_=ints_cat_6_in_t_3"
],
[
"Drug Crime",
"https://www.imdb.com/interest/in0000053/?ref_=ints_cat_6_in_t_4"
],
[
"Film Noir",
"https://www.imdb.com/interest/in0000054/?ref_=ints_cat_6_in_t_5"
],
[
"Gangster",
"https://www.imdb.com/interest/in0000055/?ref_=ints_cat_6_in_t_6"
],
[
"Heist",
"https://www.imdb.com/interest/in0000056/?ref_=ints_cat_6_in_t_7"
],
[
"Police Procedural",
"https://www.imdb.com/interest/in0000057/?ref_=ints_cat_6_in_t_8"
],
[
"True Crime",
"https://www.imdb.com/interest/in0000058/?ref_=ints_cat_6_in_t_9"
]
],
"Documentary": [
[
"Crime Documentary",
"https://www.imdb.com/interest/in0000059/?ref_=ints_cat_7_in_t_1"
],
[
"Documentary",
"https://www.imdb.com/interest/in0000060/?ref_=ints_cat_7_in_t_2"
],
[
"Docuseries",
"https://www.imdb.com/interest/in0000061/?ref_=ints_cat_7_in_t_3"
],
[
"Faith & Spirituality Documentary",
"https://www.imdb.com/interest/in0000062/?ref_=ints_cat_7_in_t_4"
],
[
"Food Documentary",
"https://www.imdb.com/interest/in0000063/?ref_=ints_cat_7_in_t_5"
],
[
"History Documentary",
"https://www.imdb.com/interest/in0000064/?ref_=ints_cat_7_in_t_6"
],
[
"Military Documentary",
"https://www.imdb.com/interest/in0000065/?ref_=ints_cat_7_in_t_7"
],
[
"Music Documentary",
"https://www.imdb.com/interest/in0000066/?ref_=ints_cat_7_in_t_8"
],
[
"Nature Documentary",
"https://www.imdb.com/interest/in0000067/?ref_=ints_cat_7_in_t_9"
],
[
"Political Documentary",
"https://www.imdb.com/interest/in0000068/?ref_=ints_cat_7_in_t_10"
],
[
"Science & Technology Documentary",
"https://www.imdb.com/interest/in0000069/?ref_=ints_cat_7_in_t_11"
],
[
"Sports Documentary",
"https://www.imdb.com/interest/in0000070/?ref_=ints_cat_7_in_t_12"
],
[
"Travel Documentary",
"https://www.imdb.com/interest/in0000071/?ref_=ints_cat_7_in_t_13"
]
],
"Drama": [
[
"Biography",
"https://www.imdb.com/interest/in0000072/?ref_=ints_cat_8_in_t_1"
],
[
"Coming-of-Age",
"https://www.imdb.com/interest/in0000073/?ref_=ints_cat_8_in_t_2"
],
[
"Costume Drama",
"https://www.imdb.com/interest/in0000074/?ref_=ints_cat_8_in_t_3"
],
[
"Docudrama",
"https://www.imdb.com/interest/in0000075/?ref_=ints_cat_8_in_t_4"
],
[
"Drama",
"https://www.imdb.com/interest/in0000076/?ref_=ints_cat_8_in_t_5"
],
[
"Epic",
"https://www.imdb.com/interest/in0000077/?ref_=ints_cat_8_in_t_6"
],
[
"Financial Drama",
"https://www.imdb.com/interest/in0000078/?ref_=ints_cat_8_in_t_7"
],
[
"Historical Epic",
"https://www.imdb.com/interest/in0000079/?ref_=ints_cat_8_in_t_8"
],
[
"History",
"https://www.imdb.com/interest/in0000080/?ref_=ints_cat_8_in_t_9"
],
[
"Korean Drama",
"https://www.imdb.com/interest/in0000209/?ref_=ints_cat_8_in_t_10"
],
[
"Legal Drama",
"https://www.imdb.com/interest/in0000081/?ref_=ints_cat_8_in_t_11"
],
[
"Medical Drama",
"https://www.imdb.com/interest/in0000082/?ref_=ints_cat_8_in_t_12"
],
[
"Period Drama",
"https://www.imdb.com/interest/in0000083/?ref_=ints_cat_8_in_t_13"
],
[
"Political Drama",
"https://www.imdb.com/interest/in0000084/?ref_=ints_cat_8_in_t_14"
],
[
"Prison Drama",
"https://www.imdb.com/interest/in0000085/?ref_=ints_cat_8_in_t_15"
],
[
"Psychological Drama",
"https://www.imdb.com/interest/in0000086/?ref_=ints_cat_8_in_t_16"
],
[
"Showbiz Drama",
"https://www.imdb.com/interest/in0000087/?ref_=ints_cat_8_in_t_17"
],
[
"Soap Opera",
"https://www.imdb.com/interest/in0000088/?ref_=ints_cat_8_in_t_18"
],
[
"Teen Drama",
"https://www.imdb.com/interest/in0000089/?ref_=ints_cat_8_in_t_19"
],
[
"Telenovela",
"https://www.imdb.com/interest/in0000210/?ref_=ints_cat_8_in_t_20"
],
[
"Tragedy",
"https://www.imdb.com/interest/in0000090/?ref_=ints_cat_8_in_t_21"
],
[
"Workplace Drama",
"https://www.imdb.com/interest/in0000091/?ref_=ints_cat_8_in_t_22"
]
],
"Family": [
[
"Animal Adventure",
"https://www.imdb.com/interest/in0000092/?ref_=ints_cat_9_in_t_1"
],
[
"Family",
"https://www.imdb.com/interest/in0000093/?ref_=ints_cat_9_in_t_2"
]
],
"Fantasy": [
[
"Dark Fantasy",
"https://www.imdb.com/interest/in0000095/?ref_=ints_cat_10_in_t_1"
],
[
"Fairy Tale",
"https://www.imdb.com/interest/in0000097/?ref_=ints_cat_10_in_t_2"
],
[
"Fantasy",
"https://www.imdb.com/interest/in0000098/?ref_=ints_cat_10_in_t_3"
],
[
"Fantasy Epic",
"https://www.imdb.com/interest/in0000096/?ref_=ints_cat_10_in_t_4"
],
[
"Supernatural Fantasy",
"https://www.imdb.com/interest/in0000099/?ref_=ints_cat_10_in_t_5"
],
[
"Sword & Sorcery",
"https://www.imdb.com/interest/in0000100/?ref_=ints_cat_10_in_t_6"
],
[
"Teen Fantasy",
"https://www.imdb.com/interest/in0000101/?ref_=ints_cat_10_in_t_7"
]
],
"Game Show": [
[
"Beauty Competition",
"https://www.imdb.com/interest/in0000102/?ref_=ints_cat_11_in_t_1"
],
[
"Cooking Competition",
"https://www.imdb.com/interest/in0000103/?ref_=ints_cat_11_in_t_2"
],
[
"Game Show",
"https://www.imdb.com/interest/in0000105/?ref_=ints_cat_11_in_t_3"
],
[
"Quiz Show",
"https://www.imdb.com/interest/in0000104/?ref_=ints_cat_11_in_t_4"
],
[
"Survival Competition",
"https://www.imdb.com/interest/in0000106/?ref_=ints_cat_11_in_t_5"
],
[
"Talent Competition",
"https://www.imdb.com/interest/in0000107/?ref_=ints_cat_11_in_t_6"
]
],
"Horror": [
[
"B-Horror",
"https://www.imdb.com/interest/in0000108/?ref_=ints_cat_12_in_t_1"
],
[
"Body Horror",
"https://www.imdb.com/interest/in0000109/?ref_=ints_cat_12_in_t_2"
],
[
"Folk Horror",
"https://www.imdb.com/interest/in0000110/?ref_=ints_cat_12_in_t_3"
],
[
"Found Footage Horror",
"https://www.imdb.com/interest/in0000111/?ref_=ints_cat_12_in_t_4"
],
[
"Horror",
"https://www.imdb.com/interest/in0000112/?ref_=ints_cat_12_in_t_5"
],
[
"Monster Horror",
"https://www.imdb.com/interest/in0000113/?ref_=ints_cat_12_in_t_6"
],
[
"Psychological Horror",
"https://www.imdb.com/interest/in0000114/?ref_=ints_cat_12_in_t_7"
],
[
"Slasher Horror",
"https://www.imdb.com/interest/in0000115/?ref_=ints_cat_12_in_t_8"
],
[
"Splatter Horror",
"https://www.imdb.com/interest/in0000116/?ref_=ints_cat_12_in_t_9"
],
[
"Supernatural Horror",
"https://www.imdb.com/interest/in0000117/?ref_=ints_cat_12_in_t_10"
],
[
"Teen Horror",
"https://www.imdb.com/interest/in0000118/?ref_=ints_cat_12_in_t_11"
],
[
"Vampire Horror",
"https://www.imdb.com/interest/in0000119/?ref_=ints_cat_12_in_t_12"
],
[
"Werewolf Horror",
"https://www.imdb.com/interest/in0000120/?ref_=ints_cat_12_in_t_13"
],
[
"Witch Horror",
"https://www.imdb.com/interest/in0000121/?ref_=ints_cat_12_in_t_14"
],
[
"Zombie Horror",
"https://www.imdb.com/interest/in0000122/?ref_=ints_cat_12_in_t_15"
]
],
"Lifestyle": [
[
"Beauty Makeover",
"https://www.imdb.com/interest/in0000123/?ref_=ints_cat_13_in_t_1"
],
[
"Cooking & Food",
"https://www.imdb.com/interest/in0000124/?ref_=ints_cat_13_in_t_2"
],
[
"Home Improvement",
"https://www.imdb.com/interest/in0000125/?ref_=ints_cat_13_in_t_3"
],
[
"Lifestyle",
"https://www.imdb.com/interest/in0000126/?ref_=ints_cat_13_in_t_4"
],
[
"News",
"https://www.imdb.com/interest/in0000211/?ref_=ints_cat_13_in_t_5"
],
[
"Talk Show",
"https://www.imdb.com/interest/in0000127/?ref_=ints_cat_13_in_t_6"
],
[
"Travel",
"https://www.imdb.com/interest/in0000128/?ref_=ints_cat_13_in_t_7"
]
],
"Music": [
[
"Concert",
"https://www.imdb.com/interest/in0000129/?ref_=ints_cat_14_in_t_1"
],
[
"Music",
"https://www.imdb.com/interest/in0000130/?ref_=ints_cat_14_in_t_2"
]
],
"Musical": [
[
"Classic Musical",
"https://www.imdb.com/interest/in0000131/?ref_=ints_cat_15_in_t_1"
],
[
"Jukebox Musical",
"https://www.imdb.com/interest/in0000132/?ref_=ints_cat_15_in_t_2"
],
[
"Musical",
"https://www.imdb.com/interest/in0000133/?ref_=ints_cat_15_in_t_3"
],
[
"Pop Musical",
"https://www.imdb.com/interest/in0000134/?ref_=ints_cat_15_in_t_4"
],
[
"Rock Musical",
"https://www.imdb.com/interest/in0000135/?ref_=ints_cat_15_in_t_5"
]
],
"Mystery": [
[
"Bumbling Detective",
"https://www.imdb.com/interest/in0000136/?ref_=ints_cat_16_in_t_1"
],
[
"Cozy Mystery",
"https://www.imdb.com/interest/in0000137/?ref_=ints_cat_16_in_t_2"
],
[
"Hard-boiled Detective",
"https://www.imdb.com/interest/in0000138/?ref_=ints_cat_16_in_t_3"
],
[
"Mystery",
"https://www.imdb.com/interest/in0000139/?ref_=ints_cat_16_in_t_4"
],
[
"Suspense Mystery",
"https://www.imdb.com/interest/in0000140/?ref_=ints_cat_16_in_t_5"
],
[
"Whodunnit",
"https://www.imdb.com/interest/in0000141/?ref_=ints_cat_16_in_t_6"
]
],
"Reality TV": [
[
"Business Reality TV",
"https://www.imdb.com/interest/in0000142/?ref_=ints_cat_17_in_t_1"
],
[
"Crime Reality TV",
"https://www.imdb.com/interest/in0000143/?ref_=ints_cat_17_in_t_2"
],
[
"Dating Reality TV",
"https://www.imdb.com/interest/in0000144/?ref_=ints_cat_17_in_t_3"
],
[
"Docusoap Reality TV",
"https://www.imdb.com/interest/in0000145/?ref_=ints_cat_17_in_t_4"
],
[
"Hidden Camera",
"https://www.imdb.com/interest/in0000146/?ref_=ints_cat_17_in_t_5"
],
[
"Paranormal Reality TV",
"https://www.imdb.com/interest/in0000147/?ref_=ints_cat_17_in_t_6"
],
[
"Reality TV",
"https://www.imdb.com/interest/in0000148/?ref_=ints_cat_17_in_t_7"
]
],
"Romance": [
[
"Dark Romance",
"https://www.imdb.com/interest/in0000149/?ref_=ints_cat_18_in_t_1"
],
[
"Feel-Good Romance",
"https://www.imdb.com/interest/in0000151/?ref_=ints_cat_18_in_t_2"
],
[
"Romance",
"https://www.imdb.com/interest/in0000152/?ref_=ints_cat_18_in_t_3"
],
[
"Romantic Comedy",
"https://www.imdb.com/interest/in0000153/?ref_=ints_cat_18_in_t_4"
],
[
"Romantic Epic",
"https://www.imdb.com/interest/in0000150/?ref_=ints_cat_18_in_t_5"
],
[
"Steamy Romance",
"https://www.imdb.com/interest/in0000154/?ref_=ints_cat_18_in_t_6"
],
[
"Teen Romance",
"https://www.imdb.com/interest/in0000155/?ref_=ints_cat_18_in_t_7"
],
[
"Tragic Romance",
"https://www.imdb.com/interest/in0000156/?ref_=ints_cat_18_in_t_8"
]
],
"Sci-Fi": [
[
"Alien Invasion",
"https://www.imdb.com/interest/in0000157/?ref_=ints_cat_19_in_t_1"
],
[
"Artificial Intelligence",
"https://www.imdb.com/interest/in0000158/?ref_=ints_cat_19_in_t_2"
],
[
"Cyberpunk",
"https://www.imdb.com/interest/in0000159/?ref_=ints_cat_19_in_t_3"
],
[
"Dystopian Sci-Fi",
"https://www.imdb.com/interest/in0000160/?ref_=ints_cat_19_in_t_4"
],
[
"Kaiju",
"https://www.imdb.com/interest/in0000161/?ref_=ints_cat_19_in_t_5"
],
[
"Sci-Fi",
"https://www.imdb.com/interest/in0000162/?ref_=ints_cat_19_in_t_6"
],
[
"Sci-Fi Epic",
"https://www.imdb.com/interest/in0000163/?ref_=ints_cat_19_in_t_7"
],
[
"Space Sci-Fi",
"https://www.imdb.com/interest/in0000164/?ref_=ints_cat_19_in_t_8"
],
[
"Steampunk",
"https://www.imdb.com/interest/in0000165/?ref_=ints_cat_19_in_t_9"
],
[
"Time Travel",
"https://www.imdb.com/interest/in0000166/?ref_=ints_cat_19_in_t_10"
]
],
"Seasonal": [
[
"Holiday",
"https://www.imdb.com/interest/in0000192/?ref_=ints_cat_20_in_t_1"
],
[
"Holiday Animation",
"https://www.imdb.com/interest/in0000193/?ref_=ints_cat_20_in_t_2"
],
[
"Holiday Comedy",
"https://www.imdb.com/interest/in0000194/?ref_=ints_cat_20_in_t_3"
],
[
"Holiday Family",
"https://www.imdb.com/interest/in0000195/?ref_=ints_cat_20_in_t_4"
],
[
"Holiday Romance",
"https://www.imdb.com/interest/in0000196/?ref_=ints_cat_20_in_t_5"
]
],
"Short": [
[
"Short",
"https://www.imdb.com/interest/in0000212/?ref_=ints_cat_21_in_t_1"
]
],
"Sport": [
[
"Baseball",
"https://www.imdb.com/interest/in0000167/?ref_=ints_cat_22_in_t_1"
],
[
"Basketball",
"https://www.imdb.com/interest/in0000168/?ref_=ints_cat_22_in_t_2"
],
[
"Boxing",
"https://www.imdb.com/interest/in0000169/?ref_=ints_cat_22_in_t_3"
],
[
"Extreme Sport",
"https://www.imdb.com/interest/in0000170/?ref_=ints_cat_22_in_t_4"
],
[
"Football",
"https://www.imdb.com/interest/in0000171/?ref_=ints_cat_22_in_t_5"
],
[
"Motorsport",
"https://www.imdb.com/interest/in0000172/?ref_=ints_cat_22_in_t_6"
],
[
"Soccer",
"https://www.imdb.com/interest/in0000173/?ref_=ints_cat_22_in_t_7"
],
[
"Sport",
"https://www.imdb.com/interest/in0000174/?ref_=ints_cat_22_in_t_8"
],
[
"Water Sport",
"https://www.imdb.com/interest/in0000175/?ref_=ints_cat_22_in_t_9"
]
],
"Thriller": [
[
"Conspiracy Thriller",
"https://www.imdb.com/interest/in0000176/?ref_=ints_cat_23_in_t_1"
],
[
"Cyber Thriller",
"https://www.imdb.com/interest/in0000177/?ref_=ints_cat_23_in_t_2"
],
[
"Erotic Thriller",
"https://www.imdb.com/interest/in0000178/?ref_=ints_cat_23_in_t_3"
],
[
"Giallo",
"https://www.imdb.com/interest/in0000179/?ref_=ints_cat_23_in_t_4"
],
[
"Legal Thriller",
"https://www.imdb.com/interest/in0000180/?ref_=ints_cat_23_in_t_5"
],
[
"Political Thriller",
"https://www.imdb.com/interest/in0000181/?ref_=ints_cat_23_in_t_6"
],
[
"Psychological Thriller",
"https://www.imdb.com/interest/in0000182/?ref_=ints_cat_23_in_t_7"
],
[
"Serial Killer",
"https://www.imdb.com/interest/in0000183/?ref_=ints_cat_23_in_t_8"
],
[
"Spy",
"https://www.imdb.com/interest/in0000184/?ref_=ints_cat_23_in_t_9"
],
[
"Survival",
"https://www.imdb.com/interest/in0000185/?ref_=ints_cat_23_in_t_10"
],
[
"Thriller",
"https://www.imdb.com/interest/in0000186/?ref_=ints_cat_23_in_t_11"
]
],
"Western": [
[
"Classical Western",
"https://www.imdb.com/interest/in0000187/?ref_=ints_cat_24_in_t_1"
],
[
"Contemporary Western",
"https://www.imdb.com/interest/in0000188/?ref_=ints_cat_24_in_t_2"
],
[
"Spaghetti Western",
"https://www.imdb.com/interest/in0000190/?ref_=ints_cat_24_in_t_3"
],
[
"Western",
"https://www.imdb.com/interest/in0000191/?ref_=ints_cat_24_in_t_4"
],
[
"Western Epic",
"https://www.imdb.com/interest/in0000189/?ref_=ints_cat_24_in_t_5"
]
]
}

27
data/movies_links.json Normal file
View File

@ -0,0 +1,27 @@
[
"https://www.imdb.com/title/tt0468569/",
"https://www.imdb.com/title/tt1375666/",
"https://www.imdb.com/title/tt0133093/",
"https://www.imdb.com/title/tt1345836/",
"https://www.imdb.com/title/tt0172495/",
"https://www.imdb.com/title/tt0372784/",
"https://www.imdb.com/title/tt0848228/",
"https://www.imdb.com/title/tt0076759/",
"https://www.imdb.com/title/tt0080684/",
"https://www.imdb.com/title/tt0499549/",
"https://www.imdb.com/title/tt4154796/",
"https://www.imdb.com/title/tt2015381/",
"https://www.imdb.com/title/tt0110413/",
"https://www.imdb.com/title/tt4154756/",
"https://www.imdb.com/title/tt0325980/",
"https://www.imdb.com/title/tt0266697/",
"https://www.imdb.com/title/tt0103064/",
"https://www.imdb.com/title/tt0434409/",
"https://www.imdb.com/title/tt1431045/",
"https://www.imdb.com/title/tt0371746/",
"https://www.imdb.com/title/tt0086190/",
"https://www.imdb.com/title/tt1392190/",
"https://www.imdb.com/title/tt0107290/",
"https://www.imdb.com/title/tt0082971/",
"https://www.imdb.com/title/tt1392170/"
]

View File

@ -0,0 +1,2 @@
# HTTP headers passed to every requests.get() call made by the scrapers.
# The User-Agent mimics a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36'
    ),
}

# Directory prefix for generated JSON files. The scrapers build paths by plain
# string concatenation, so the trailing slash is required.
data_save_location = "data/"

View File

@ -0,0 +1,67 @@
import requests
from bs4 import BeautifulSoup
import config
import json
# Headings that a naive scrape of the interests page picks up even though
# they are not genres. Any genre or sub-genre whose name appears here is
# skipped by the scraper.
black_list = [
    "Popular interests", "Advanced search",
    "About this page", "Recently viewed",
]
"""
QUICK INFO:
KEY - main genre
VALUE - list of lists => for each subgenre in subgenres: subgenre[0] is the name, subgenre[1] is the URL
save=False - if True, save the result to the file interest.json
Gets all interests from IMDb: https://www.imdb.com/interest/all/
Interests are all types of genres and subgenres.
The result is a dict with genres as keys and lists of lists as values:
[
[subgenre_name,subgenre_url],
[subgenre_name,subgenre_url],...
]
"""
def get_interests(save=False):
    """Scrape all IMDb interests (genres and their sub-genres).

    Downloads https://www.imdb.com/interest/all/ and, for every page section
    whose heading is not in ``black_list``, collects the sub-genre cards
    found inside that section.

    Args:
        save: When True, additionally write the result as JSON to
            ``config.data_save_location + "interest.json"``.

    Returns:
        A dict mapping each main genre name to a list of
        ``[subgenre_name, subgenre_url]`` pairs. The dict is returned
        whether or not ``save`` is set.

    Raises:
        requests.RequestException: If the page cannot be fetched (network
            failure, timeout, or an HTTP error status).
    """
    result = {}
    url = "https://www.imdb.com/interest/all/"
    site = requests.get(url, headers=config.headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page and possibly
    # overwriting previously saved data with an empty dict.
    site.raise_for_status()
    soup = BeautifulSoup(site.text, 'html.parser')

    # Find all sections with main interests
    interest_sections = soup.find_all(
        'section',
        class_='ipc-page-section ipc-page-section--baseAlt',
    )
    for section in interest_sections:
        # Genre title is the header of the section
        genre_header = section.find('h3', class_='ipc-title__text')
        if not genre_header:
            continue
        genre = genre_header.text.strip()
        if genre in black_list:
            continue

        sub_genres = []
        # Cards from the row under the genre title. Only the stable component
        # class is matched; the generated "sc-..." class names that used to be
        # part of the selector are build-specific.
        # NOTE(review): confirm against the live page that no unrelated
        # anchors carry this class.
        cards = section.find_all('a', class_='ipc-slate-card__title')
        for card in cards:
            sub_genre_name = card.text.strip()
            href = card.get('href')
            # Skip anchors without a target and (defensively) blacklisted names
            if not href or sub_genre_name in black_list:
                continue
            sub_genres.append([sub_genre_name, "https://www.imdb.com" + href])
        result[genre] = sub_genres

    if save:
        with open(config.data_save_location + "interest.json", 'w',
                  encoding='utf-8') as f:
            json.dump(result, f, indent=4)
    return result

View File

@ -0,0 +1,37 @@
import requests
from bs4 import BeautifulSoup
import config
import json
import re
def get_movies_links_for_interest(url):
    """Collect IMDb title links for the movies listed under one interest.

    Loads the interest page at ``url``, follows its "see all movies" chip
    (sorted by number of votes, descending) and gathers every distinct
    ``/title/tt<digits>/`` link found on the resulting page. The links are
    printed, their count is printed, and the list is written as JSON to
    ``config.data_save_location + 'movies_links.json'``.

    Args:
        url: URL of an IMDb interest page,
            e.g. "https://www.imdb.com/interest/in0000001/".

    Returns:
        The list of absolute title URLs in page order, or None when the
        interest page has no usable "see all movies" chip (nothing is
        written in that case).

    Raises:
        requests.RequestException: If either page cannot be fetched (network
            failure, timeout, or an HTTP error status).
    """
    site = requests.get(url, headers=config.headers, timeout=30)
    site.raise_for_status()
    soup = BeautifulSoup(site.text, 'html.parser')

    # Find button to get only movies
    button = soup.find('a', {
        'class': 'ipc-chip ipc-chip--on-baseAlt',
        'data-testid': 'chip-see-all-movies'
    })
    if not button or not button.get('href'):
        return None

    # Add sort by number of votes descending to get only popular movies.
    # Use '?' when the chip link carries no query string yet.
    button_href = button['href']
    separator = '&' if '?' in button_href else '?'
    movies_url = ("https://www.imdb.com" + button_href
                  + separator + "sort=num_votes,desc")
    site = requests.get(movies_url, headers=config.headers, timeout=30)
    site.raise_for_status()
    soup = BeautifulSoup(site.text, 'html.parser')

    title_pattern = re.compile(r'/title/tt[0-9]+/')
    results = []
    seen = set()
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on lookup.
    for anchor in soup.find_all('a', href=True):
        # match() anchors at the start, so only hrefs beginning with
        # /title/tt<digits>/ are kept; anything else is ignored.
        matched = title_pattern.match(anchor['href'])
        if matched is None:
            continue
        complete_url = "https://www.imdb.com" + matched.group()
        if complete_url not in seen:
            seen.add(complete_url)
            results.append(complete_url)

    print(results)
    print(len(results))
    with open(config.data_save_location + 'movies_links.json', 'w',
              encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    return results
if __name__ == "__main__":
    # Example run for the "Action" interest. Guarded so that importing this
    # module does not trigger network requests and file writes.
    get_movies_links_for_interest("https://www.imdb.com/interest/in0000001/")