projekt_ml/reddit.py
2025-01-19 16:27:39 +01:00

93 lines
3.0 KiB
Python

import logging
import os
from io import BytesIO
from typing import Any

import PIL
import PIL.Image
import requests

from settings import IMAGE_TARGET_SIZE
# Directory the script entry point passes as the image output target.
TECHNICAL_DIR = "technical"

# Subreddits the script entry point iterates over.
SUBREDDITS = ["shittytechnicals"]
def get_top_json(subreddit: str, after: str | None = None) -> dict[str, Any]:
    """Get one page of the all-time top posts from a subreddit

    Args:
        subreddit (str): The subreddit to get the top posts from
        after (str | None, optional): Pagination cursor sent as the ``after``
            query parameter. When None, no cursor is sent. Defaults to None.

    Returns:
        dict[str, Any]: The JSON response from the Reddit API

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status.
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.reddit.com/r/{subreddit}/top.json"
    headers = {"User-Agent": "Mozilla/5.0"}
    params = {"t": "all"}
    if after is not None:
        params["after"] = after
    # requests never times out on its own; bound the wait so a stalled
    # connection cannot hang the caller indefinitely.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    response.raise_for_status()
    return response.json()
# Seconds to wait on an image download before giving up.
_DOWNLOAD_TIMEOUT_S = 10


def _try_get(url: str) -> "requests.Response | None":
    """Download ``url``, logging a warning instead of raising on failure

    Args:
        url (str): The URL to fetch

    Returns:
        requests.Response | None: The response, which may still carry a
            non-OK status (that case is logged here as well), or None when
            the request itself raised.
    """
    try:
        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_S)
    except requests.RequestException as e:
        logging.warning(f"Failed to download {url}: {e}")
        return None
    if not response.ok:
        logging.warning(
            f"Failed to download {url}: {response.status_code}, {response.reason}"
        )
    return response


def save_subreddit_images(
    subreddit: str, target: str, sz: int, limit: int = 100
) -> int:
    """Save the images from the top posts of a subreddit

    Each preview image of each post is downloaded, resized to ``sz`` x ``sz``
    and written into ``target`` under the last path segment of its URL. If the
    preview answers with a non-OK status, the post's thumbnail is used
    instead. Images that cannot be downloaded, decoded or written are logged
    and skipped.

    Args:
        subreddit (str): The subreddit to get the images from
        target (str): The directory to save the images to (created if missing)
        sz (int): The size to resize the images to
        limit (int, optional): The maximum number of images to download.
            Defaults to 100.

    Returns:
        int: The number of images downloaded
    """
    os.makedirs(target, exist_ok=True)
    gotten = 0
    after = None
    while gotten < limit:
        listing = get_top_json(subreddit, after)["data"]
        for post in listing["children"]:
            preview = post["data"].get("preview")
            if preview is None:
                continue
            for entry in preview["images"]:
                # A single page can hold more images than are still needed.
                if gotten >= limit:
                    return gotten
                url: str = entry["source"]["url"].split("?")[0]
                response = _try_get(url)
                if response is None:
                    continue
                if not response.ok:
                    # Fall back to the post thumbnail when the preview is refused.
                    thumb = post["data"].get("thumbnail")
                    if thumb is None:
                        continue
                    response = _try_get(thumb)
                    if response is None or not response.ok:
                        continue
                try:
                    picture = PIL.Image.open(BytesIO(response.content))
                    resized = picture.resize((sz, sz))
                    resized.save(os.path.join(target, url.split("/")[-1]))
                except (OSError, ValueError) as e:
                    # Undecodable bytes or an unwritable file should not
                    # abort the whole run.
                    logging.warning(f"Failed to save {url}: {e}")
                    continue
                gotten += 1
        after = listing.get("after")
        if after is None:
            # No further pages: stop instead of requesting the first page again.
            break
    return gotten
if __name__ == "__main__":
    # Script entry point: download up to 512 images per configured subreddit.
    for name in SUBREDDITS:
        save_subreddit_images(
            subreddit=name,
            target=TECHNICAL_DIR,
            sz=IMAGE_TARGET_SIZE,
            limit=512,
        )