Compare commits
4 Commits
cc79bb9d9d
...
ba9e42b9c6
Author | SHA1 | Date | |
---|---|---|---|
ba9e42b9c6 | |||
6f3567982d | |||
d76914b4c5 | |||
4bb6619911 |
53
README.md
53
README.md
@ -2,18 +2,15 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats.
|
This project is a web scraper designed to extract data from websites.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Extracts data from web pages
|
☑️ Extracts data from web pages
|
||||||
<!-- - Supports multiple data formats (CSV, JSON, etc.)
|
|
||||||
- Customizable scraping rules
|
|
||||||
- Error handling and logging -->
|
|
||||||
|
|
||||||
## Installation
|
## Usage
|
||||||
|
|
||||||
### Using Docker
|
### With Docker
|
||||||
|
|
||||||
1. Clone the repository:
|
1. Clone the repository:
|
||||||
|
|
||||||
@ -27,57 +24,51 @@ git clone https://git.wmi.amu.edu.pl/s500042/webscraper
|
|||||||
cd webscraper
|
cd webscraper
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Build the Docker image and run it using script:
|
3. Build the Docker image and run it using the `start.py` script:
|
||||||
- On Linux, ?Mac <!-- I haven't tested it yet -->
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./start.sh
|
python scripts/start.py
|
||||||
```
|
```
|
||||||
|
|
||||||
- Windows 🤡
|
On Mac, you'll have to use
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python start.py
|
python3 scripts/start.py
|
||||||
```
|
```
|
||||||
|
|
||||||
This one will work just fine on Linux, but on Mac, you'll have to use
|
4. Check the `/app/dist/data.json` file to see the extracted data.
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 start.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Without Docker
|
### Without Docker
|
||||||
|
|
||||||
1. Clone the repository:
|
1. Clone the repository:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/yourusername/webscraper.git
|
git clone https://git.wmi.amu.edu.pl/s500042/webscraper
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Navigate to the project directory:
|
2. Install the required dependencies:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd webscraper/app
|
pip install -r app/requirements.txt
|
||||||
```
|
|
||||||
|
|
||||||
3. Install the required dependencies:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're on Arch Linux, you'll need to create a virtual environment.
|
If you're on Arch Linux, you'll need to create a virtual environment.
|
||||||
Here is a [step-by-step guide](#) that will help you create it.
|
Here is a [step-by-step guide](#) that will help you create it.
|
||||||
|
|
||||||
## Usage
|
3. Run the `run_with_no_docker.py` script:
|
||||||
|
|
||||||
1. Configure the scraper by editing the `config.json` file.
|
|
||||||
2. Run the scraper:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scraper.py
|
python scripts/run_with_no_docker.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
On Mac, you'll need to use:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/run_with_no_docker.py
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check the `/app/dist/data.json` file to see the extracted data.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
|
|
||||||
|
|
||||||
def quitCondition(command: str) -> bool:
|
|
||||||
return command in ["q", "quit", "exit", "stop"]
|
|
||||||
|
|
||||||
|
|
||||||
def helpCondition(command: str) -> bool:
|
|
||||||
return command in ["h", "help"]
|
|
||||||
|
|
||||||
|
|
||||||
def clearCondition(command: str) -> bool:
|
|
||||||
return command in ["c", "clear", "cls"]
|
|
||||||
|
|
||||||
|
|
||||||
def systemCommand(command: str) -> str:
|
|
||||||
words = command[slice(1, len(command))].split()
|
|
||||||
if words[0] == "":
|
|
||||||
return "Command not found. Write 'h' for help."
|
|
||||||
return subprocess.run(
|
|
||||||
f'docker exec -it webscraper {" ".join(words)}',
|
|
||||||
shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
).stdout.decode()
|
|
||||||
|
|
||||||
|
|
||||||
def restartCondition(command: str) -> bool:
|
|
||||||
return command in ["r", "restart"]
|
|
||||||
|
|
||||||
|
|
||||||
def runCondition(command: str) -> bool:
|
|
||||||
return command in ["run"]
|
|
30
scripts/get_path.py
Normal file
30
scripts/get_path.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import os
|
||||||
|
from run_command import run_command
|
||||||
|
|
||||||
|
|
||||||
|
def get_path():
    """Return the root directory of the webscraper checkout containing the cwd.

    The current working directory is split into components and searched for
    a component named exactly ``webscraper``:

    * no such component: the sentinel message
      ``"This is not a valid webscraper project."`` is returned (this case
      used to crash with ``ValueError`` from a leftover debug ``print``);
    * exactly one: the path up to and including that component;
    * several (nested directories of that name): the deepest ancestor that
      still contains a ``webscraper`` component and has a ``scripts``
      sub-directory, or the sentinel message when none qualifies.

    Returns:
        str: a ``/``-separated directory path, or the sentinel message above.
        Callers interpolate the result into paths, so the sentinel is kept
        as a plain string rather than turned into an exception.
    """
    invalid = "This is not a valid webscraper project."

    # os.getcwd() gives the same information as running ``pwd`` without
    # spawning a shell and without a trailing newline to strip.
    parts = os.getcwd().replace(os.sep, "/").split("/")

    if "webscraper" not in parts:
        return invalid

    if parts.count("webscraper") == 1:
        return "/".join(parts[: parts.index("webscraper") + 1])

    # Several components are called "webscraper": walk from the deepest
    # directory upwards and pick the first one that looks like the project
    # root, i.e. that owns a "scripts" directory.
    for end in range(len(parts), 0, -1):
        prefix = parts[:end]
        if "webscraper" not in prefix:
            # Every shorter prefix lies above all "webscraper" directories.
            break
        candidate = "/".join(prefix)
        if os.path.isdir(f"{candidate}/scripts"):
            return candidate
    return invalid
|
||||||
|
|
||||||
|
|
||||||
|
def run_main():
|
||||||
|
print(get_path())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_main()
|
13
scripts/run_command.py
Normal file
13
scripts/run_command.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def run_command(command: str) -> str:
    """Run *command* through the shell and return its captured stdout.

    Args:
        command: Complete shell command line. It is handed to the shell
            verbatim (``shell=True``), so callers are responsible for
            quoting any untrusted or space-containing fragments.

    Returns:
        The command's standard output decoded as text. Bytes that are not
        valid UTF-8 are replaced instead of raising ``UnicodeDecodeError``.

    Raises:
        SystemExit: when the command exits with a non-zero status. The exit
            code is propagated and the command's stderr is echoed first.
    """
    process = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if process.returncode != 0:
        # Diagnostics belong on stderr so they never mix with real output
        # that callers print or capture from stdout.
        print(f"Error running command: {command}", file=sys.stderr)
        print(process.stderr.decode(errors="replace"), file=sys.stderr)
        sys.exit(process.returncode)
    return process.stdout.decode(errors="replace")
|
17
scripts/run_with_no_docker.py
Normal file
17
scripts/run_with_no_docker.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
from run_command import run_command
|
||||||
|
from get_path import get_path
|
||||||
|
|
||||||
|
|
||||||
|
def run_main():
    """Run ``app/main.py`` of the located project with ``python3`` and print its output.

    The project root comes from ``get_path()``. ``run_command`` reports a
    failing command and terminates the process itself via ``sys.exit``; it
    never raises ``subprocess.CalledProcessError``, so no handler for that
    exception is needed here.
    """
    import shlex

    path = get_path()
    # Quote the script path: the command line is interpreted by a shell, so
    # a checkout located under a directory with spaces would otherwise be
    # split into several arguments.
    script = shlex.quote(f"{path}/app/main.py")
    print(run_command(f"python3 {script}"))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_main()
|
@ -1,31 +1,25 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import os
|
import os
|
||||||
import sys
|
from run_command import run_command
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: str) -> str:
|
|
||||||
process = subprocess.run(
|
|
||||||
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
||||||
)
|
|
||||||
if process.returncode != 0:
|
|
||||||
print(f"Error running command: {command}")
|
|
||||||
print(process.stderr.decode())
|
|
||||||
sys.exit(process.returncode)
|
|
||||||
return process.stdout.decode()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
docker_compose_file = os.getenv("DOCKER_COMPOSE_FILE", "./app/docker-compose.yaml")
|
docker_compose_file = os.getenv(
|
||||||
|
"DOCKER_COMPOSE_FILE", "$WEBSCRAPER/app/docker-compose.yaml"
|
||||||
|
)
|
||||||
service_name = os.getenv("SERVICE_NAME", "webscraper")
|
service_name = os.getenv("SERVICE_NAME", "webscraper")
|
||||||
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
||||||
|
try:
|
||||||
|
print("Starting Docker Compose services...\n")
|
||||||
|
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||||
|
|
||||||
print("Starting Docker Compose services...\n")
|
print(run_command(f"docker exec {service_name} python {script_name}"))
|
||||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
|
||||||
|
|
||||||
print(run_command(f"docker exec {service_name} python {script_name}"))
|
print("Stopping and removing Docker Compose services...")
|
||||||
|
run_command(f"docker compose -f {docker_compose_file} down")
|
||||||
print("Stopping and removing Docker Compose services...")
|
except subprocess.CalledProcessError as e:
|
||||||
run_command(f"docker compose -f {docker_compose_file} down")
|
print("An error occurred while running the script.")
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
57
scripts/threads/commands.py
Normal file
57
scripts/threads/commands.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
from run_command import run_command
|
||||||
|
|
||||||
|
|
||||||
|
def quitCondition(command: str) -> bool:
|
||||||
|
return command in ["q", "quit", "exit", "stop"]
|
||||||
|
|
||||||
|
|
||||||
|
def quitService(path: str):
|
||||||
|
print("Stopping and removing Docker Compose services...")
|
||||||
|
run_command(f"docker compose -f {path}/app/docker-compose.yaml down")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def helpCondition(command: str) -> bool:
|
||||||
|
return command in ["h", "help"]
|
||||||
|
|
||||||
|
|
||||||
|
def clearCondition(command: str) -> bool:
|
||||||
|
return command in ["c", "clear", "cls"]
|
||||||
|
|
||||||
|
|
||||||
|
def systemCommand(command: str) -> None:
    """Run the text following the ``$`` prefix inside the ``webscraper`` container.

    Args:
        command: Raw prompt input whose first character is the ``$`` marker,
            e.g. ``"$ls -la"``. Everything after that first character is
            forwarded to ``docker exec -it webscraper``.

    Returns:
        None. The command's output, or a hint when nothing follows the
        marker, is printed directly.
    """
    words = command[1:].split()
    # str.split() never yields empty strings, so an empty/blank command shows
    # up as an empty list; indexing words[0] here would raise IndexError.
    if not words:
        print("Command not found. Write 'h' for help.")
        return None
    print(
        run_command(
            f'docker exec -it webscraper {" ".join(words)}',
        )
    )
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def restartCondition(command: str) -> bool:
|
||||||
|
return command in ["r", "restart"]
|
||||||
|
|
||||||
|
|
||||||
|
def restartService(path: str):
|
||||||
|
print("Restarting Docker Compose services...")
|
||||||
|
run_command(f"docker compose -f {path}/app/docker-compose.yaml down")
|
||||||
|
run_command(f"docker compose -f {path}/app/docker-compose.yaml up -d")
|
||||||
|
print("Composed!")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def runCondition(command: str) -> bool:
|
||||||
|
return command in ["run"]
|
||||||
|
|
||||||
|
|
||||||
|
def runService():
|
||||||
|
print("Running main.py...")
|
||||||
|
print(
|
||||||
|
run_command(
|
||||||
|
"docker exec -it webscraper python main.py",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return None
|
9
scripts/threads/help_list.py
Normal file
9
scripts/threads/help_list.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
def help_list():
|
||||||
|
return """
|
||||||
|
["h", "help"], - for help.
|
||||||
|
["q", "quit", "exit", "stop"], - to stop program.
|
||||||
|
["c", "clear", "cls"], - to clear console.
|
||||||
|
["r", "restart"], - to restart Docker Compose services.
|
||||||
|
["run"], - to run main.py in docker container.
|
||||||
|
["$..."], - to evaluate command in docker container.
|
||||||
|
"""
|
@ -1,65 +1,30 @@
|
|||||||
import commands
|
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
import threading
|
from threads.commands import *
|
||||||
|
from run_command import run_command
|
||||||
|
from get_path import get_path
|
||||||
|
from threads.help_list import help_list
|
||||||
|
|
||||||
|
|
||||||
def prompt():
|
def prompt():
|
||||||
while True:
|
while True:
|
||||||
command = input("> ")
|
command = input("> ")
|
||||||
if commands.quitCondition(command):
|
if quitCondition(command):
|
||||||
print("Stopping and removing Docker Compose services...")
|
quitService(get_path())
|
||||||
subprocess.run(
|
|
||||||
"docker compose -f ../app/docker-compose.yaml down",
|
|
||||||
shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
if commands.helpCondition(command):
|
if helpCondition(command):
|
||||||
print(
|
print(help_list())
|
||||||
"""
|
|
||||||
["h", "help"], - for help.
|
|
||||||
["q", "quit", "exit", "stop"], - to stop program.
|
|
||||||
["c", "clear", "cls"], - to clear console.
|
|
||||||
["r", "restart"], - to restart Docker Compose services.
|
|
||||||
["run"], - to run main.py in docker container.
|
|
||||||
["$..."], - to evaluate command in docker container.
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
if commands.clearCondition(command):
|
if clearCondition(command):
|
||||||
print("\n" * 100)
|
run_command("clear")
|
||||||
continue
|
continue
|
||||||
if command.startswith("$"):
|
if command.startswith("$"):
|
||||||
print(commands.systemCommand(command))
|
systemCommand(command)
|
||||||
continue
|
continue
|
||||||
if commands.restartCondition(command):
|
if restartCondition(command):
|
||||||
print("Restarting Docker Compose services...")
|
restartService(get_path())
|
||||||
subprocess.run(
|
|
||||||
"docker compose -f ../app/docker-compose.yaml down",
|
|
||||||
shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
)
|
|
||||||
subprocess.run(
|
|
||||||
"docker compose -f ../app/docker-compose.yaml up -d",
|
|
||||||
shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
)
|
|
||||||
print("Composed!")
|
|
||||||
continue
|
continue
|
||||||
if commands.runCondition(command):
|
if runCondition(command):
|
||||||
print("Running main.py...")
|
runService()
|
||||||
print(
|
|
||||||
subprocess.run(
|
|
||||||
"docker exec -it webscraper python main.py",
|
|
||||||
shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
).stdout.decode()
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
if command == "":
|
if command == "":
|
||||||
continue
|
continue
|
||||||
|
@ -1,26 +1,20 @@
|
|||||||
import subprocess
|
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
from threads.prompt import prompt
|
from threads.prompt import prompt
|
||||||
|
from run_command import run_command
|
||||||
|
from get_path import get_path
|
||||||
def run_command(command: str) -> str:
|
|
||||||
process = subprocess.run(
|
|
||||||
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
||||||
)
|
|
||||||
if process.returncode != 0:
|
|
||||||
print(f"Error running command: {command}")
|
|
||||||
return process.stderr.decode()
|
|
||||||
return process.stdout.decode()
|
|
||||||
|
|
||||||
|
|
||||||
thread = threading.Thread(target=prompt)
|
thread = threading.Thread(target=prompt)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
path = get_path()
|
||||||
|
docker_compose_file = os.getenv(
|
||||||
|
"DOCKER_COMPOSE_FILE", f"{path}/app/docker-compose.yaml"
|
||||||
|
)
|
||||||
print("Starting Docker Compose services...")
|
print("Starting Docker Compose services...")
|
||||||
run_command("docker compose -f ../app/docker-compose.yaml up -d")
|
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||||
print("Composed!\n")
|
print("Composed!\n")
|
||||||
print("Running main.py...")
|
print("Running main.py...")
|
||||||
print(run_command("docker exec -it webscraper python main.py"))
|
print(run_command("docker exec -it webscraper python main.py"))
|
||||||
@ -31,10 +25,9 @@ def main():
|
|||||||
print("\nWatching for changes...")
|
print("\nWatching for changes...")
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
path_to_watch = "/home/paprykdev/uni/webscraper/app"
|
|
||||||
before = {
|
before = {
|
||||||
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
|
f: os.stat(os.path.join(path, "app", f)).st_mtime
|
||||||
for f in os.listdir(path_to_watch)
|
for f in os.listdir(os.path.join(path, "app"))
|
||||||
if f.endswith(".py")
|
if f.endswith(".py")
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,8 +36,8 @@ def main():
|
|||||||
break
|
break
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
after = {
|
after = {
|
||||||
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
|
f: os.stat(os.path.join(path, "app", f)).st_mtime
|
||||||
for f in os.listdir(path_to_watch)
|
for f in os.listdir(os.path.join(path, "app"))
|
||||||
if f.endswith(".py")
|
if f.endswith(".py")
|
||||||
}
|
}
|
||||||
for f in before:
|
for f in before:
|
||||||
|
Loading…
Reference in New Issue
Block a user