Compare commits

...

4 Commits

Author SHA1 Message Date
04b9a5cd0c
docs: update README.md 2024-11-12 05:17:45 +01:00
a069139573
docs: add MIT LICENSE 2024-11-12 05:17:24 +01:00
c5b61893ac
feat: add docker integration 2024-11-12 05:17:00 +01:00
b25f04d218
feat: initial commit 2024-11-12 05:16:21 +01:00
11 changed files with 306 additions and 0 deletions

26
.dockerignore Normal file
View File

@ -0,0 +1,26 @@
# Ignore Python cache and compiled files
__pycache__/
*.py[cod]
*.pyo
# Ignore virtual environment directories
.venv/
venv/
env/
# Ignore logs
logs/
*.log
# Ignore environment variables
.env
# Ignore Docker files
Dockerfile
docker-compose.yml
# Ignore build directories
dist/
build/
# Ignore any other files or directories you want to exclude

32
.gitignore vendored Normal file
View File

@ -0,0 +1,32 @@
# Ignore Python bytecode files
*.pyc
*.pyo
__pycache__/
# Ignore virtual environment directories
.venv/
venv/
env/
# Ignore system files
.DS_Store
Thumbs.db
# Ignore log files
*.log
# Ignore temporary files
*.tmp
*.swp
# Ignore output files
dist/
build/
*.egg-info/
# Ignore environment files
.env
# IDE files
.idea/
.vscode/

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Patryk Ilkiw
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,83 @@
# Web scraper 🔍
## Description
This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats.
## Features
- Extracts data from web pages
<!-- - Supports multiple data formats (CSV, JSON, etc.)
- Customizable scraping rules
- Error handling and logging -->
## Installation
### Using Docker
1. Clone the repository:
```bash
git clone https://git.wmi.amu.edu.pl/s500042/webscraper
```
2. Navigate to the project directory:
```bash
cd webscraper
```
3. Build the Docker image and run it using script:
   - On Linux, and possibly macOS <!-- not yet tested on macOS -->
```bash
./start.sh
```
   - On Windows
```bash
python start.py
```
   This command works on Linux; on macOS, use:
```bash
python3 start.py
```
### Without Docker
1. Clone the repository:
```bash
   git clone https://git.wmi.amu.edu.pl/s500042/webscraper
```
2. Navigate to the project directory:
```bash
cd webscraper/app
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
```
If you're on Arch Linux, you'll need to create a virtual environment.
   Here is a [step-by-step guide](#) that will help you create it.
## Usage
1. Configure the scraper by editing the `config.json` file.
2. Run the scraper:
```bash
   python main.py
```
## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

26
app/Dockerfile Normal file
View File

@ -0,0 +1,26 @@
# Image for the webscraper app: Python 3.9 slim base plus chromedriver.
FROM python:3.9-slim
WORKDIR /usr/src/app
# Install Python deps first so this layer is cached independently of app code.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# NOTE(review): COPY . . before the apt-get layer below means any source change
# invalidates the (slow) apt/chromedriver layer — consider moving it after.
COPY . .
# NOTE(review): chromedriver 114 is installed, but requirements.txt lists no
# selenium and no browser is installed here — confirm this is actually needed.
# The "docker" apt package inside the container is also unusual — verify.
RUN apt-get update && apt-get install -y \
    wget \
    unzip \
    curl \
    docker \
    libx11-dev \
    libgdk-pixbuf2.0-0 \
    libcanberra-gtk-module \
    && wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && mv chromedriver /usr/local/bin/ \
    && chmod +x /usr/local/bin/chromedriver
# Dropping root privileges is currently disabled:
# RUN chown -R python /usr/src/app
# USER python
CMD ["python", "main.py"]

13
app/main.py Normal file
View File

@ -0,0 +1,13 @@
from scraper import scraper
import time
def main() -> None:
    """Run the scraper once, announcing start and finish on stdout.

    The trailing sleep keeps the process alive briefly after the run —
    presumably so the console output stays visible; confirm with caller.
    """
    banner_start = "Starting the application...\n\n"
    banner_done = "\n\nApplication finished!"

    print(banner_start)
    scraper()
    print(banner_done)
    time.sleep(8)


if __name__ == "__main__":
    main()

8
app/requirements.txt Normal file
View File

@ -0,0 +1,8 @@
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
charset-normalizer==3.4.0
idna==3.10
requests==2.32.3
soupsieve==2.6
urllib3==2.2.3

17
app/scraper.py Normal file
View File

@ -0,0 +1,17 @@
import os
import json
def scraper():
    """Write the scraped data to ``dist/data.json``.

    Currently the payload is an empty list — the actual scraping logic
    is evidently not implemented yet; this just establishes the output
    file and directory.

    Side effects:
        Creates the ``dist`` directory if missing and (over)writes
        ``dist/data.json``; prints a completion message.
    """
    directory = "dist"
    file_path = os.path.join(directory, "data.json")
    data = []

    # Fix: the original called os.mkdir("dist"), re-hard-coding the name
    # instead of using `directory`. makedirs(..., exist_ok=True) is also
    # idempotent without needing a try/except FileExistsError.
    os.makedirs(directory, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file)

    print("Data has been scraped!")

26
docker-compose.yaml Normal file
View File

@ -0,0 +1,26 @@
# Compose stack: the scraper app plus a Redis instance.
services:
  webscraper:
    build:
      context: ./app
    container_name: webscraper
    depends_on:
      - redis
    # Bind-mount the source so the container sees local edits.
    volumes:
      - ./app:/usr/src/app
    # `docker compose watch` rules: rebuild on dependency changes,
    # sync source edits into the running container.
    develop:
      watch:
        - path: ./app/requirements.txt
          action: rebuild
        - path: ./app
          target: /usr/src/app
          action: sync
  redis:
    image: "redis:alpine"
    volumes:
      - redis_data:/data
    ports:
      - "6379:6379"
volumes:
  redis_data:
  # NOTE(review): this "app" volume is declared but not referenced by any
  # service (webscraper uses a bind mount instead) — confirm it is needed.
  app:

47
start.py Normal file
View File

@ -0,0 +1,47 @@
import subprocess
import time
def run_command(command):
    """Execute *command* through the shell and return its stdout as text.

    On a nonzero exit status the command and its stderr are printed,
    but the (possibly empty) stdout is still returned — callers treat
    an empty string as "no result".
    """
    proc = subprocess.run(command, shell=True, capture_output=True)

    if proc.returncode != 0:
        print(f"Error running command: {command}")
        print(proc.stderr.decode())

    return proc.stdout.decode()
def wait_for_webscraper():
    """Block until the webscraper container exits (polling every 3 s).

    Returns immediately if the container cannot be found. Relies on
    ``run_command`` returning an empty string when a command fails.
    """
    while True:
        container_id = run_command("docker compose ps -q webscraper").strip()
        if not container_id:
            print("Webscraper container not found.")
            break
        # BUG FIX: inside an f-string, '{{.State.Status}}' collapses to
        # '{.State.Status}', which is an invalid Go template, so the status
        # never read "exited". Concatenate instead so the double braces
        # reach docker intact.
        status = run_command(
            "docker inspect --format '{{.State.Status}}' " + container_id
        )
        if status.strip() == "exited":
            print("Webscraper has finished.")
            break
        print("Waiting for webscraper to finish...")
        time.sleep(3)
def main():
    """Orchestrate one scraper run inside Docker Compose.

    Brings the stack up detached, blocks until the webscraper
    container stops, then stops and removes the services.
    """
    run = run_command

    print("Starting Docker Compose services...")
    run("docker compose up -d")

    wait_for_webscraper()

    print("Stopping and removing Docker Compose services...")
    run("docker compose down")


if __name__ == "__main__":
    main()

7
start.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Bring the compose stack up detached, block until the webscraper
# container stops, then tear everything down.
docker compose up -d
# NOTE(review): `docker compose wait` exits with the container's own exit
# code, so the teardown below runs regardless of scraper success/failure.
docker compose wait webscraper > /dev/null
docker compose down