Added functions, new regular expressions and changed the library
This commit is contained in:
parent
364798ca52
commit
005ba17744
83
projekt.py
83
projekt.py
@ -1,50 +1,71 @@
|
|||||||
import fitz
|
import re
|
||||||
import re
|
from pdfminer.high_level import extract_text
|
||||||
|
|
||||||
my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf"
|
|
||||||
|
my_path = r"C:\Users\DELL\Downloads\Kopia 1_240014-333-361-389-399-433-441-448-464-481_2024_06_20_CE240143 (INVOICE - ERD-FVZ%i9210621%i922024%i92NZ-1 - 1 - A1) - 1.pdf.pdf"
|
||||||
|
#my_path = r"C:/Users/DELL/Downloads/A-24VU-00511.PDF.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
#regex
|
||||||
postal_code_pattern = r"(?<!\S)(?:(?:[A-Za-z]{3,}\s+)(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})(?:\s+[A-Za-z]{3,}))(?!\S)"
|
postal_code_pattern = r"(?<!\S)(?:(?:[A-Za-z]{3,}\s+)(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})(?:\s+[A-Za-z]{3,}))(?!\S)"
|
||||||
with fitz.open(my_path) as doc:
|
postal_code_pattern2 = r"(?<!\S)(?:[A-Za-z]{3,}\s+-?\s+(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})\s+-?\s+[A-Za-z]{3,})(?!\S)"
|
||||||
order_numbers = []
|
date_pattern = r'\b\d{2}/\d{2}/\d{2,4}\b'
|
||||||
order_dates = []
|
reference_pattern = r"ref\.\s*[^0-9]*?([1-9]\d{4,})"
|
||||||
buyers = []
|
|
||||||
|
|
||||||
for page in doc:
|
order_numbers = []
|
||||||
text = page.get_text("text")
|
order_dates = set()
|
||||||
|
buyers = []
|
||||||
|
|
||||||
if "Your ref. no. PO" in text:
|
text = extract_text(my_path).lower()
|
||||||
parts = text.split("Your ref. no. PO")
|
|
||||||
for part in parts[1:]:
|
|
||||||
order_number = part.split()[0].strip()
|
|
||||||
order_numbers.append(order_number)
|
|
||||||
|
|
||||||
date_pattern = r'\b\d{2}/\d{2}/\d{2,4}\b'
|
def dates(text):
|
||||||
matches = re.findall(date_pattern, text)
|
matches = re.findall(date_pattern, text)
|
||||||
order_dates.extend(matches)
|
for match in matches:
|
||||||
|
order_dates.add(match)
|
||||||
|
|
||||||
|
def ref_numbers(text):
|
||||||
|
matches = re.findall(reference_pattern, text)
|
||||||
|
for match in matches:
|
||||||
|
order_numbers.append(match)
|
||||||
|
|
||||||
|
|
||||||
lines = text.splitlines()
|
def company(text):
|
||||||
for i in range(1, len(lines)):
|
lines = text.splitlines()
|
||||||
line = lines[i]
|
for i in range(len(lines)):
|
||||||
|
line = lines[i]
|
||||||
|
|
||||||
|
# Sprawdzenie, czy linia zawiera kod pocztowy
|
||||||
|
if re.search(postal_code_pattern, line) or re.search(postal_code_pattern2, line):
|
||||||
|
# Szukanie linii powyżej
|
||||||
|
line_above = next((lines[j].strip() for j in range(i - 1, -1, -1) if len(lines[j].strip()) > 0), '')
|
||||||
|
# Szukanie linii z nazwą firmy
|
||||||
|
line_2 = next((lines[j].strip() for j in range(i - 1, -1, -1) if len(lines[j].strip()) > 0 and 'ul.' not in lines[j].strip() and lines[j].strip() != line_above), '')
|
||||||
|
# Szukanie linii poniżej
|
||||||
|
line_below = next((lines[j].strip() for j in range(i + 1, len(lines)) if len(lines[j].strip()) > 0), '')
|
||||||
|
|
||||||
if re.search(postal_code_pattern, line):
|
if line_2 and line_above and line and line_below:
|
||||||
|
buyer = {
|
||||||
line_2 = lines[i-2] if i>0 else ''
|
|
||||||
line_above = lines[i - 1] if i > 0 else ''
|
|
||||||
line_below = lines[i + 1] if i + 1 < len(lines) else ''
|
|
||||||
|
|
||||||
buyers.append({
|
|
||||||
'line_2': line_2,
|
'line_2': line_2,
|
||||||
'line_above': line_above,
|
'line_above': line_above,
|
||||||
'postal_code_line': line,
|
'postal_code_line': line,
|
||||||
'line_below': line_below
|
'line_below': line_below
|
||||||
})
|
}
|
||||||
|
if buyer not in buyers:
|
||||||
|
buyers.append(buyer)
|
||||||
|
|
||||||
|
dates(text)
|
||||||
|
ref_numbers(text)
|
||||||
|
company(text)
|
||||||
|
|
||||||
|
order_dates = list(order_dates)
|
||||||
|
|
||||||
print("Reference numbers:", order_numbers)
|
print("Reference numbers:", order_numbers)
|
||||||
print("Document dates:", order_dates)
|
print("Document dates:", order_dates)
|
||||||
print("Buyers: ")
|
print("Buyers: ")
|
||||||
|
|
||||||
for context in buyers:
|
for context in buyers:
|
||||||
print("Company: ", context['line_2'])
|
print("Company:", context['line_2'])
|
||||||
print("St. :", context['line_above'])
|
print("St.:", context['line_above'])
|
||||||
print("City: ", context['postal_code_line'])
|
print("City:", context['postal_code_line'])
|
||||||
print("Country:", context['line_below'])
|
print("Country:", context['line_below'])
|
||||||
print()
|
print()
|
||||||
|
Loading…
Reference in New Issue
Block a user