Compare commits
No commits in common. "ba9e42b9c6f15b5a237a2bd824c2bdc4bbe74ace" and "cc79bb9d9df67407ae026e0720df890bf62013cd" have entirely different histories.
ba9e42b9c6
...
cc79bb9d9d
53
README.md
53
README.md
@ -2,15 +2,18 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This project is a web scraper designed to extract data from websites.
|
This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
☑️ Extracts data from web pages
|
- Extracts data from web pages
|
||||||
|
<!-- - Supports multiple data formats (CSV, JSON, etc.)
|
||||||
|
- Customizable scraping rules
|
||||||
|
- Error handling and logging -->
|
||||||
|
|
||||||
## Usage
|
## Installation
|
||||||
|
|
||||||
### With Docker
|
### Using Docker
|
||||||
|
|
||||||
1. Clone the repository:
|
1. Clone the repository:
|
||||||
|
|
||||||
@ -24,51 +27,57 @@ git clone https://git.wmi.amu.edu.pl/s500042/webscraper
|
|||||||
cd webscraper
|
cd webscraper
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Build the Docker image and run it using `start.py` script:
|
3. Build the Docker image and run it using the start script:
|
||||||
|
- On Linux (and possibly Mac) <!-- I haven't tested it on Mac yet -->
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/start.py
|
./start.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
On Mac, you'll have to use
|
- Windows 🤡
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 scripts/start.py
|
python start.py
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Check `/app/dist/data.json` file to see the extracted data.
|
This one will work just fine on Linux, but on Mac, you'll have to use
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 start.py
|
||||||
|
```
|
||||||
|
|
||||||
### Without Docker
|
### Without Docker
|
||||||
|
|
||||||
1. Clone the repository:
|
1. Clone the repository:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://git.wmi.amu.edu.pl/s500042/webscraper
|
git clone https://github.com/yourusername/webscraper.git
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install the required dependencies:
|
2. Navigate to the project directory:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -r app/requirements.txt
|
cd webscraper/app
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Install the required dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're on Arch Linux, you'll need to create a virtual environment.
|
If you're on Arch Linux, you'll need to create a virtual environment.
|
||||||
Here is a [step-by-step guide](#) that will help you create it.
|
Here is a [step-by-step guide](#) that will help you create it.
|
||||||
|
|
||||||
3. Run `run_with_no_docker.py` script:
|
## Usage
|
||||||
|
|
||||||
|
1. Configure the scraper by editing the `config.json` file.
|
||||||
|
2. Run the scraper:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/run_with_no_docker.py
|
python scraper.py
|
||||||
```
|
```
|
||||||
|
|
||||||
On Mac, you'll need to use:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 scripts/run_with_no_docker.py
|
|
||||||
```
|
|
||||||
|
|
||||||
4. Check `/app/dist/data.json` file to see the extracted data.
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
||||||
|
33
scripts/commands.py
Normal file
33
scripts/commands.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def quitCondition(command: str) -> bool:
|
||||||
|
return command in ["q", "quit", "exit", "stop"]
|
||||||
|
|
||||||
|
|
||||||
|
def helpCondition(command: str) -> bool:
|
||||||
|
return command in ["h", "help"]
|
||||||
|
|
||||||
|
|
||||||
|
def clearCondition(command: str) -> bool:
|
||||||
|
return command in ["c", "clear", "cls"]
|
||||||
|
|
||||||
|
|
||||||
|
def systemCommand(command: str) -> str:
|
||||||
|
words = command[slice(1, len(command))].split()
|
||||||
|
if words[0] == "":
|
||||||
|
return "Command not found. Write 'h' for help."
|
||||||
|
return subprocess.run(
|
||||||
|
f'docker exec -it webscraper {" ".join(words)}',
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
).stdout.decode()
|
||||||
|
|
||||||
|
|
||||||
|
def restartCondition(command: str) -> bool:
|
||||||
|
return command in ["r", "restart"]
|
||||||
|
|
||||||
|
|
||||||
|
def runCondition(command: str) -> bool:
|
||||||
|
return command in ["run"]
|
@ -1,30 +0,0 @@
|
|||||||
import os
|
|
||||||
from run_command import run_command
|
|
||||||
|
|
||||||
|
|
||||||
def get_path():
|
|
||||||
pwd = run_command("pwd")
|
|
||||||
splitted = pwd.split("/")
|
|
||||||
splitted[-1] = splitted[-1].replace("\n", "")
|
|
||||||
|
|
||||||
print(splitted[: splitted.index("webscraper") + 1])
|
|
||||||
|
|
||||||
if splitted.count("webscraper") > 1 and "webscraper" in splitted:
|
|
||||||
for i in range(len(splitted) - 1, -1, -1):
|
|
||||||
potential_path = "/".join(splitted[: i + 1])
|
|
||||||
if "webscraper" in potential_path:
|
|
||||||
script_path = f"{potential_path}/scripts"
|
|
||||||
if os.path.isdir(script_path):
|
|
||||||
return potential_path
|
|
||||||
else:
|
|
||||||
return "This is not a valid webscraper project."
|
|
||||||
else:
|
|
||||||
return "/".join(splitted[: splitted.index("webscraper") + 1])
|
|
||||||
|
|
||||||
|
|
||||||
def run_main():
|
|
||||||
print(get_path())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_main()
|
|
@ -1,13 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: str) -> str:
|
|
||||||
process = subprocess.run(
|
|
||||||
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
||||||
)
|
|
||||||
if process.returncode != 0:
|
|
||||||
print(f"Error running command: {command}")
|
|
||||||
print(process.stderr.decode())
|
|
||||||
sys.exit(process.returncode)
|
|
||||||
return process.stdout.decode()
|
|
@ -1,17 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
import os
|
|
||||||
from run_command import run_command
|
|
||||||
from get_path import get_path
|
|
||||||
|
|
||||||
|
|
||||||
def run_main():
|
|
||||||
path = get_path()
|
|
||||||
try:
|
|
||||||
result = run_command(f"python3 {path}/app/main.py")
|
|
||||||
print(result)
|
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
print(f"Error occurred: {e.stderr}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_main()
|
|
@ -1,15 +1,24 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import os
|
import os
|
||||||
from run_command import run_command
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def run_command(command: str) -> str:
|
||||||
|
process = subprocess.run(
|
||||||
|
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||||
|
)
|
||||||
|
if process.returncode != 0:
|
||||||
|
print(f"Error running command: {command}")
|
||||||
|
print(process.stderr.decode())
|
||||||
|
sys.exit(process.returncode)
|
||||||
|
return process.stdout.decode()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
docker_compose_file = os.getenv(
|
docker_compose_file = os.getenv("DOCKER_COMPOSE_FILE", "./app/docker-compose.yaml")
|
||||||
"DOCKER_COMPOSE_FILE", "$WEBSCRAPER/app/docker-compose.yaml"
|
|
||||||
)
|
|
||||||
service_name = os.getenv("SERVICE_NAME", "webscraper")
|
service_name = os.getenv("SERVICE_NAME", "webscraper")
|
||||||
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
||||||
try:
|
|
||||||
print("Starting Docker Compose services...\n")
|
print("Starting Docker Compose services...\n")
|
||||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||||
|
|
||||||
@ -17,9 +26,6 @@ def main():
|
|||||||
|
|
||||||
print("Stopping and removing Docker Compose services...")
|
print("Stopping and removing Docker Compose services...")
|
||||||
run_command(f"docker compose -f {docker_compose_file} down")
|
run_command(f"docker compose -f {docker_compose_file} down")
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
print("An error occurred while running the script.")
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1,57 +0,0 @@
|
|||||||
from run_command import run_command
|
|
||||||
|
|
||||||
|
|
||||||
def quitCondition(command: str) -> bool:
|
|
||||||
return command in ["q", "quit", "exit", "stop"]
|
|
||||||
|
|
||||||
|
|
||||||
def quitService(path: str):
|
|
||||||
print("Stopping and removing Docker Compose services...")
|
|
||||||
run_command(f"docker compose -f {path}/app/docker-compose.yaml down")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def helpCondition(command: str) -> bool:
|
|
||||||
return command in ["h", "help"]
|
|
||||||
|
|
||||||
|
|
||||||
def clearCondition(command: str) -> bool:
|
|
||||||
return command in ["c", "clear", "cls"]
|
|
||||||
|
|
||||||
|
|
||||||
def systemCommand(command: str) -> str:
|
|
||||||
words = command[1:].split()
|
|
||||||
if words[0] == "":
|
|
||||||
return "Command not found. Write 'h' for help."
|
|
||||||
print(
|
|
||||||
run_command(
|
|
||||||
f'docker exec -it webscraper {" ".join(words)}',
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def restartCondition(command: str) -> bool:
|
|
||||||
return command in ["r", "restart"]
|
|
||||||
|
|
||||||
|
|
||||||
def restartService(path: str):
|
|
||||||
print("Restarting Docker Compose services...")
|
|
||||||
run_command(f"docker compose -f {path}/app/docker-compose.yaml down")
|
|
||||||
run_command(f"docker compose -f {path}/app/docker-compose.yaml up -d")
|
|
||||||
print("Composed!")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def runCondition(command: str) -> bool:
|
|
||||||
return command in ["run"]
|
|
||||||
|
|
||||||
|
|
||||||
def runService():
|
|
||||||
print("Running main.py...")
|
|
||||||
print(
|
|
||||||
run_command(
|
|
||||||
"docker exec -it webscraper python main.py",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return None
|
|
@ -1,9 +0,0 @@
|
|||||||
def help_list():
|
|
||||||
return """
|
|
||||||
["h", "help"], - for help.
|
|
||||||
["q", "quit", "exit", "stop"], - to stop program.
|
|
||||||
["c", "clear", "cls"], - to clear console.
|
|
||||||
["r", "restart"], - to restart Docker Compose services.
|
|
||||||
["run"], - to run main.py in docker container.
|
|
||||||
["$..."], - to evaluate command in docker container.
|
|
||||||
"""
|
|
@ -1,30 +1,65 @@
|
|||||||
|
import commands
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from threads.commands import *
|
import threading
|
||||||
from run_command import run_command
|
|
||||||
from get_path import get_path
|
|
||||||
from threads.help_list import help_list
|
|
||||||
|
|
||||||
|
|
||||||
def prompt():
|
def prompt():
|
||||||
while True:
|
while True:
|
||||||
command = input("> ")
|
command = input("> ")
|
||||||
if quitCondition(command):
|
if commands.quitCondition(command):
|
||||||
quitService(get_path())
|
print("Stopping and removing Docker Compose services...")
|
||||||
|
subprocess.run(
|
||||||
|
"docker compose -f ../app/docker-compose.yaml down",
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
)
|
||||||
break
|
break
|
||||||
if helpCondition(command):
|
if commands.helpCondition(command):
|
||||||
print(help_list())
|
print(
|
||||||
|
"""
|
||||||
|
["h", "help"], - for help.
|
||||||
|
["q", "quit", "exit", "stop"], - to stop program.
|
||||||
|
["c", "clear", "cls"], - to clear console.
|
||||||
|
["r", "restart"], - to restart Docker Compose services.
|
||||||
|
["run"], - to run main.py in docker container.
|
||||||
|
["$..."], - to evaluate command in docker container.
|
||||||
|
"""
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
if clearCondition(command):
|
if commands.clearCondition(command):
|
||||||
run_command("clear")
|
print("\n" * 100)
|
||||||
continue
|
continue
|
||||||
if command.startswith("$"):
|
if command.startswith("$"):
|
||||||
systemCommand(command)
|
print(commands.systemCommand(command))
|
||||||
continue
|
continue
|
||||||
if restartCondition(command):
|
if commands.restartCondition(command):
|
||||||
restartService(get_path())
|
print("Restarting Docker Compose services...")
|
||||||
|
subprocess.run(
|
||||||
|
"docker compose -f ../app/docker-compose.yaml down",
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
)
|
||||||
|
subprocess.run(
|
||||||
|
"docker compose -f ../app/docker-compose.yaml up -d",
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
)
|
||||||
|
print("Composed!")
|
||||||
continue
|
continue
|
||||||
if runCondition(command):
|
if commands.runCondition(command):
|
||||||
runService()
|
print("Running main.py...")
|
||||||
|
print(
|
||||||
|
subprocess.run(
|
||||||
|
"docker exec -it webscraper python main.py",
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
).stdout.decode()
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
if command == "":
|
if command == "":
|
||||||
continue
|
continue
|
||||||
|
@ -1,20 +1,26 @@
|
|||||||
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
from threads.prompt import prompt
|
from threads.prompt import prompt
|
||||||
from run_command import run_command
|
|
||||||
from get_path import get_path
|
|
||||||
|
def run_command(command: str) -> str:
|
||||||
|
process = subprocess.run(
|
||||||
|
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||||
|
)
|
||||||
|
if process.returncode != 0:
|
||||||
|
print(f"Error running command: {command}")
|
||||||
|
return process.stderr.decode()
|
||||||
|
return process.stdout.decode()
|
||||||
|
|
||||||
|
|
||||||
thread = threading.Thread(target=prompt)
|
thread = threading.Thread(target=prompt)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
path = get_path()
|
|
||||||
docker_compose_file = os.getenv(
|
|
||||||
"DOCKER_COMPOSE_FILE", f"{path}/app/docker-compose.yaml"
|
|
||||||
)
|
|
||||||
print("Starting Docker Compose services...")
|
print("Starting Docker Compose services...")
|
||||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
run_command("docker compose -f ../app/docker-compose.yaml up -d")
|
||||||
print("Composed!\n")
|
print("Composed!\n")
|
||||||
print("Running main.py...")
|
print("Running main.py...")
|
||||||
print(run_command("docker exec -it webscraper python main.py"))
|
print(run_command("docker exec -it webscraper python main.py"))
|
||||||
@ -25,9 +31,10 @@ def main():
|
|||||||
print("\nWatching for changes...")
|
print("\nWatching for changes...")
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
|
path_to_watch = "/home/paprykdev/uni/webscraper/app"
|
||||||
before = {
|
before = {
|
||||||
f: os.stat(os.path.join(path, "app", f)).st_mtime
|
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
|
||||||
for f in os.listdir(os.path.join(path, "app"))
|
for f in os.listdir(path_to_watch)
|
||||||
if f.endswith(".py")
|
if f.endswith(".py")
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -36,8 +43,8 @@ def main():
|
|||||||
break
|
break
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
after = {
|
after = {
|
||||||
f: os.stat(os.path.join(path, "app", f)).st_mtime
|
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
|
||||||
for f in os.listdir(os.path.join(path, "app"))
|
for f in os.listdir(path_to_watch)
|
||||||
if f.endswith(".py")
|
if f.endswith(".py")
|
||||||
}
|
}
|
||||||
for f in before:
|
for f in before:
|
||||||
|
Loading…
Reference in New Issue
Block a user