Compare commits

..

4 Commits

10 changed files with 187 additions and 151 deletions

View File

@ -2,18 +2,15 @@
## Description ## Description
This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats. This project is a web scraper designed to extract data from websites.
## Features ## Features
- Extracts data from web pages ☑️ Extracts data from web pages
<!-- - Supports multiple data formats (CSV, JSON, etc.)
- Customizable scraping rules
- Error handling and logging -->
## Installation ## Usage
### Using Docker ### With Docker
1. Clone the repository: 1. Clone the repository:
@ -27,57 +24,51 @@ git clone https://git.wmi.amu.edu.pl/s500042/webscraper
cd webscraper cd webscraper
``` ```
3. Build the Docker image and run it using script: 3. Build the Docker image and run it using `start.py` script:
- On Linux, ?Mac <!-- I haven't tested it yet -->
```bash ```bash
./start.sh python scripts/start.py
``` ```
- Windows 🤡 On Mac, you'll have to use
```bash ```bash
python start.py python3 scripts/start.py
``` ```
This one will work just fine on Linux, but on Mac, you'll have to use 4. Check `/app/dist/data.json` file to see the extracted data.
```bash
python3 start.py
```
### Without Docker ### Without Docker
1. Clone the repository: 1. Clone the repository:
```bash ```bash
git clone https://github.com/yourusername/webscraper.git git clone https://git.wmi.amu.edu.pl/s500042/webscraper
``` ```
2. Navigate to the project directory: 2. Install the required dependencies:
```bash ```bash
cd webscraper/app pip install -r app/requirements.txt
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
``` ```
If you're on Arch Linux, you'll need to create a virtual environment. If you're on Arch Linux, you'll need to create a virtual environment.
Here is a [step-by-step guide](#) that will help you create it. Here is a [step-by-step guide](#) that will help you create it.
## Usage 3. Run `run_with_no_docker.py` script:
1. Configure the scraper by editing the `config.json` file.
2. Run the scraper:
```bash ```bash
python scraper.py python scripts/run_with_no_docker.py
``` ```
On Mac, you'll need to use:
```bash
python3 scripts/run_with_no_docker.py
```
4. Check `/app/dist/data.json` file to see the extracted data.
## License ## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

View File

@ -1,33 +0,0 @@
import subprocess
def quitCondition(command: str) -> bool:
    """Tell whether *command* is one of the words that stop the program."""
    quit_words = ("q", "quit", "exit", "stop")
    return command in quit_words
def helpCondition(command: str) -> bool:
    """Tell whether *command* asks for the help text."""
    help_words = ("h", "help")
    return command in help_words
def clearCondition(command: str) -> bool:
    """Tell whether *command* asks to clear the console."""
    clear_words = ("c", "clear", "cls")
    return command in clear_words
def systemCommand(command: str) -> str:
    """Run the text after the one-character prefix inside the container.

    The first character of *command* is dropped and the remaining words are
    executed with ``docker exec -it webscraper``.

    Args:
        command: Prompt input whose first character is a prefix marker.

    Returns:
        The decoded standard output of the executed command, or the
        "Command not found" message when nothing follows the prefix.
    """
    words = command[1:].split()
    # str.split() never produces empty strings, so input consisting only of
    # the prefix (or prefix + whitespace) gives an empty list; indexing
    # words[0] there would raise IndexError instead of reporting the problem.
    if not words:
        return "Command not found. Write 'h' for help."
    completed = subprocess.run(
        f'docker exec -it webscraper {" ".join(words)}',
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.stdout.decode()
def restartCondition(command: str) -> bool:
    """Tell whether *command* asks to restart the Docker Compose services."""
    restart_words = ("r", "restart")
    return command in restart_words
def runCondition(command: str) -> bool:
    """Tell whether *command* is exactly the ``run`` command."""
    return command == "run"

30
scripts/get_path.py Normal file
View File

@ -0,0 +1,30 @@
import os
from run_command import run_command
def get_path():
    """Return the ``webscraper`` directory that contains the working directory.

    The current working directory is split into its components and searched
    for directories named ``webscraper``:

    * exactly one such component: the path up to and including it is
      returned;
    * several such components: the deepest one that has a ``scripts``
      sub-directory is returned, or the message
      ``"This is not a valid webscraper project."`` when none has one.

    Returns:
        The selected directory as a string (or the message described above).

    Raises:
        ValueError: if no component of the working directory is named
            ``webscraper``.
    """
    project_dir = "webscraper"
    # os.getcwd() replaces shelling out to `pwd`: no subprocess, and no
    # trailing newline to strip from the result.
    parts = os.getcwd().split(os.sep)
    hits = [index for index, part in enumerate(parts) if part == project_dir]

    if not hits:
        raise ValueError(
            f"The current directory is not inside a '{project_dir}' directory."
        )
    if len(hits) == 1:
        return os.sep.join(parts[: hits[0] + 1])

    # Several nested "webscraper" directories: prefer the innermost one that
    # actually looks like the project (it ships a scripts/ directory).
    for index in reversed(hits):
        candidate = os.sep.join(parts[: index + 1])
        if os.path.isdir(os.path.join(candidate, "scripts")):
            return candidate
    return "This is not a valid webscraper project."
def run_main():
    """Print the value returned by get_path()."""
    located = get_path()
    print(located)


if __name__ == "__main__":
    run_main()

13
scripts/run_command.py Normal file
View File

@ -0,0 +1,13 @@
import subprocess
import sys
def run_command(command: str) -> str:
    """Run *command* through the shell and return its standard output.

    Args:
        command: Command line handed to the shell verbatim; the caller is
            responsible for quoting any arguments.

    Returns:
        The command's standard output decoded as text. Bytes that are not
        valid UTF-8 are replaced rather than raising UnicodeDecodeError.

    Raises:
        SystemExit: when the command exits with a non-zero status; the exit
            code is the command's return code. The failing command and its
            standard error are reported on stderr first.
    """
    process = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if process.returncode != 0:
        # Diagnostics go to stderr so they never mix with command output
        # that callers print or capture from stdout.
        print(f"Error running command: {command}", file=sys.stderr)
        print(process.stderr.decode(errors="replace"), file=sys.stderr)
        sys.exit(process.returncode)
    return process.stdout.decode(errors="replace")

View File

@ -0,0 +1,17 @@
import subprocess
import os
from run_command import run_command
from get_path import get_path
def run_main():
    """Run ``app/main.py`` with ``python3`` on the host and print its output.

    The script location is built from the directory returned by get_path().
    """
    # Local import: shlex is only needed here.
    import shlex

    path = get_path()
    # Quote the script path so a checkout stored under a directory whose
    # name contains spaces or shell metacharacters still runs.
    script = shlex.quote(f"{path}/app/main.py")
    # No try/except: run_command() never raises CalledProcessError; on a
    # failing command it reports the error itself and exits the interpreter.
    print(run_command(f"python3 {script}"))


if __name__ == "__main__":
    run_main()

View File

@ -1,31 +1,25 @@
import subprocess import subprocess
import os import os
import sys from run_command import run_command
def run_command(command: str) -> str:
process = subprocess.run(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if process.returncode != 0:
print(f"Error running command: {command}")
print(process.stderr.decode())
sys.exit(process.returncode)
return process.stdout.decode()
def main(): def main():
docker_compose_file = os.getenv("DOCKER_COMPOSE_FILE", "./app/docker-compose.yaml") docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", "$WEBSCRAPER/app/docker-compose.yaml"
)
service_name = os.getenv("SERVICE_NAME", "webscraper") service_name = os.getenv("SERVICE_NAME", "webscraper")
script_name = os.getenv("SCRIPT_NAME", "main.py") script_name = os.getenv("SCRIPT_NAME", "main.py")
try:
print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d")
print("Starting Docker Compose services...\n") print(run_command(f"docker exec {service_name} python {script_name}"))
run_command(f"docker compose -f {docker_compose_file} up -d")
print(run_command(f"docker exec {service_name} python {script_name}")) print("Stopping and removing Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} down")
print("Stopping and removing Docker Compose services...") except subprocess.CalledProcessError as e:
run_command(f"docker compose -f {docker_compose_file} down") print("An error occurred while running the script.")
print(e)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -0,0 +1,57 @@
from run_command import run_command
def quitCondition(command: str) -> bool:
    """Tell whether *command* is one of the words that stop the program."""
    quit_words = ("q", "quit", "exit", "stop")
    return command in quit_words
def quitService(path: str):
    """Take down the Docker Compose services defined under *path*.

    The compose file is expected at ``<path>/app/docker-compose.yaml``.
    """
    compose_file = f"{path}/app/docker-compose.yaml"
    print("Stopping and removing Docker Compose services...")
    run_command(f"docker compose -f {compose_file} down")
def helpCondition(command: str) -> bool:
    """Tell whether *command* asks for the help text."""
    help_words = ("h", "help")
    return command in help_words
def clearCondition(command: str) -> bool:
    """Tell whether *command* asks to clear the console."""
    clear_words = ("c", "clear", "cls")
    return command in clear_words
def systemCommand(command: str) -> "str | None":
    """Run the text after the one-character prefix inside the container.

    The first character of *command* is dropped and the remaining words are
    executed with ``docker exec -it webscraper``; the command's output is
    printed.

    Args:
        command: Prompt input whose first character is a prefix marker.

    Returns:
        The "Command not found" message when nothing follows the prefix
        (the caller decides whether to display it); otherwise None.
    """
    words = command[1:].split()
    # str.split() never produces empty strings, so input consisting only of
    # the prefix (or prefix + whitespace) gives an empty list; indexing
    # words[0] there would raise IndexError instead of reporting the problem.
    if not words:
        return "Command not found. Write 'h' for help."
    print(
        run_command(
            f'docker exec -it webscraper {" ".join(words)}',
        )
    )
    return None
def restartCondition(command: str) -> bool:
    """Tell whether *command* asks to restart the Docker Compose services."""
    restart_words = ("r", "restart")
    return command in restart_words
def restartService(path: str):
    """Bring the Docker Compose services under *path* down and up again.

    The compose file is expected at ``<path>/app/docker-compose.yaml``.
    """
    compose_file = f"{path}/app/docker-compose.yaml"
    print("Restarting Docker Compose services...")
    # Same two compose invocations, in the same order: stop, then start detached.
    for action in ("down", "up -d"):
        run_command(f"docker compose -f {compose_file} {action}")
    print("Composed!")
def runCondition(command: str) -> bool:
    """Tell whether *command* is exactly the ``run`` command."""
    return command == "run"
def runService():
    """Execute ``main.py`` in the ``webscraper`` container and print its output."""
    print("Running main.py...")
    output = run_command("docker exec -it webscraper python main.py")
    print(output)

View File

@ -0,0 +1,9 @@
def help_list():
    """Return the multi-line help text describing the prompt commands."""
    help_text = """
["h", "help"], - for help.
["q", "quit", "exit", "stop"], - to stop program.
["c", "clear", "cls"], - to clear console.
["r", "restart"], - to restart Docker Compose services.
["run"], - to run main.py in docker container.
["$..."], - to evaluate command in docker container.
"""
    return help_text

View File

@ -1,65 +1,30 @@
import commands
import subprocess
import sys import sys
import threading from threads.commands import *
from run_command import run_command
from get_path import get_path
from threads.help_list import help_list
def prompt(): def prompt():
while True: while True:
command = input("> ") command = input("> ")
if commands.quitCondition(command): if quitCondition(command):
print("Stopping and removing Docker Compose services...") quitService(get_path())
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
break break
if commands.helpCondition(command): if helpCondition(command):
print( print(help_list())
"""
["h", "help"], - for help.
["q", "quit", "exit", "stop"], - to stop program.
["c", "clear", "cls"], - to clear console.
["r", "restart"], - to restart Docker Compose services.
["run"], - to run main.py in docker container.
["$..."], - to evaluate command in docker container.
"""
)
continue continue
if commands.clearCondition(command): if clearCondition(command):
print("\n" * 100) run_command("clear")
continue continue
if command.startswith("$"): if command.startswith("$"):
print(commands.systemCommand(command)) systemCommand(command)
continue continue
if commands.restartCondition(command): if restartCondition(command):
print("Restarting Docker Compose services...") restartService(get_path())
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
subprocess.run(
"docker compose -f ../app/docker-compose.yaml up -d",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
print("Composed!")
continue continue
if commands.runCondition(command): if runCondition(command):
print("Running main.py...") runService()
print(
subprocess.run(
"docker exec -it webscraper python main.py",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
).stdout.decode()
)
continue continue
if command == "": if command == "":
continue continue

View File

@ -1,26 +1,20 @@
import subprocess
import time import time
import os import os
import threading import threading
from threads.prompt import prompt from threads.prompt import prompt
from run_command import run_command
from get_path import get_path
def run_command(command: str) -> str:
process = subprocess.run(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if process.returncode != 0:
print(f"Error running command: {command}")
return process.stderr.decode()
return process.stdout.decode()
thread = threading.Thread(target=prompt) thread = threading.Thread(target=prompt)
def main(): def main():
path = get_path()
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", f"{path}/app/docker-compose.yaml"
)
print("Starting Docker Compose services...") print("Starting Docker Compose services...")
run_command("docker compose -f ../app/docker-compose.yaml up -d") run_command(f"docker compose -f {docker_compose_file} up -d")
print("Composed!\n") print("Composed!\n")
print("Running main.py...") print("Running main.py...")
print(run_command("docker exec -it webscraper python main.py")) print(run_command("docker exec -it webscraper python main.py"))
@ -31,10 +25,9 @@ def main():
print("\nWatching for changes...") print("\nWatching for changes...")
thread.start() thread.start()
path_to_watch = "/home/paprykdev/uni/webscraper/app"
before = { before = {
f: os.stat(os.path.join(path_to_watch, f)).st_mtime f: os.stat(os.path.join(path, "app", f)).st_mtime
for f in os.listdir(path_to_watch) for f in os.listdir(os.path.join(path, "app"))
if f.endswith(".py") if f.endswith(".py")
} }
@ -43,8 +36,8 @@ def main():
break break
time.sleep(1) time.sleep(1)
after = { after = {
f: os.stat(os.path.join(path_to_watch, f)).st_mtime f: os.stat(os.path.join(path, "app", f)).st_mtime
for f in os.listdir(path_to_watch) for f in os.listdir(os.path.join(path, "app"))
if f.endswith(".py") if f.endswith(".py")
} }
for f in before: for f in before: