HighNeed_projekt_inz/get_events_FB.py

139 lines
4.0 KiB
Python

# -*- coding: utf-8 -*-
import json
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
# --- Log in to Facebook and open the group page ------------------------------
# Credentials may be supplied through the FB_LOGIN / FB_PASSWORD environment
# variables; the literals below are the original hard-coded fallbacks.
# NOTE(review): the fallback credentials are committed in source - consider
# removing them once the environment variables are configured.
fb_login = os.environ.get("FB_LOGIN", "+48 883 763 414")
fb_password = os.environ.get("FB_PASSWORD", "23facebook11")

url = 'https://www.facebook.com/login/'
browser = webdriver.Firefox()
browser.get(url)

# Accept the cookie dialog when it is present. find_elements() returns an
# empty list instead of raising, so a missing dialog no longer aborts the run.
accept_buttons = browser.find_elements(
    By.XPATH, "//button[@title='Akceptuj wszystkie']"
)
if accept_buttons:
    accept_buttons[0].click()

# Locate the login form. find_element(By.ID, ...) replaces the
# find_element_by_id helpers, which newer Selenium releases no longer provide.
login = browser.find_element(By.ID, "email")
password = browser.find_element(By.ID, "pass")
login_button = browser.find_element(By.ID, "loginbutton")

# Queue the whole interaction and replay it in one go: focus each field,
# type into it, then press the login button.
sign_in = ActionChains(browser)
sign_in.click(on_element=login)
sign_in.send_keys(fb_login)
sign_in.click(on_element=password)
sign_in.send_keys(fb_password)
sign_in.click(on_element=login_button)
sign_in.perform()

# Fixed wait until the post-login page has loaded.
time.sleep(5)

# Go to the group page whose events will be scraped.
url = 'https://facebook.com/groups/babacoolpoznan/'
browser.get(url)
time.sleep(3)
# Optional scrolling of the group feed (disabled in the original script,
# kept here for reference):
# i = 0
# for _ in range(5):
#     browser.execute_script("window.scrollTo(0, " + str(900+i) + ");")
#     i += 500
#     time.sleep(3)

# --- Collect event links from the group page ---------------------------------
html_page = browser.page_source

# Every "/events/...?" fragment on the page. Written as a raw string so the
# backslashes reach the regex engine without invalid-escape warnings; the
# pattern text itself is unchanged.
pattern = re.compile(r'\/events\/.*?\?')
result = pattern.findall(html_page)

# Keep every third match, starting with the third one. This is exactly what
# the original pair of `del result[::3]` / `del result[::2]` left behind.
# NOTE(review): presumably each event link shows up three times in a row in
# the page source - verify against the current markup.
result = result[2::3]

# Cut each match at the first space and drop its last character (the
# trailing "?" when the match contains no space).
links = [match.split(" ")[0][:-1] for match in result]
print(links)
# --- Scrape every event page and dump it to event_<number>.json --------------
canonical_url = 'https://www.facebook.com'

# Category labels; a scraped text is recorded as a category only when it is
# exactly equal to one of these.
# NOTE(review): "Cele doroczynne", "Fitnrss" and "Literature" look misspelled
# and would then never match - confirm against the labels shown on the page
# before changing them.
kategorii = [
    "Cele doroczynne", "Dom", "Film", "Fitnrss", "Gry", "Impreza",
    "Jedzenie", "Komedia", "Literature", "Muzyka", "Napoje", "Ogrodnictwo",
    "Rękodzieło", "Sieci kontaktów", "Sport", "Sztuka", "Taniec", "Teatr",
    "Wellness", "Wydarzenia religijne", "Zakupy", "Zdrowie",
]

# Fields taken from the page's ld+json script block.
wanted_field = re.compile(r'^(name|startDate|description|endDate)')

number = 0
for link in links:
    url = canonical_url + link
    browser.get(url)
    time.sleep(3)
    data = {}

    # Part 1: fields from the ld+json script, fetched separately with
    # requests. The markup is not parsed as JSON; it is cut into fragments
    # on ',"' and each fragment is treated as "key : value".
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    script = soup.select("script[type='application/ld+json']")

    flag = 0  # 1 while inside the "location" object, whose fields are skipped
    for i in str(script).split(',"'):
        i = i.replace('":"', ' : ')
        if 'location":{' in i:
            flag = 1
        elif 'description :' in i:
            flag = 0
        if flag == 0 and wanted_field.search(i):
            parts = i.split(' : ')
            if len(parts) < 2:
                # No "key : value" separator in this fragment; the original
                # raised IndexError here.
                continue
            string = parts[1]
            string = string.encode().decode('unicode_escape')
            # Removes emoji: re-escape to ASCII, drop "\udXXX" surrogate
            # escapes, decode again, then strip leftover backslashes.
            string = string.encode('unicode_escape').decode('ascii')
            string = re.sub(r'\\ud(.){3}', '', string)
            string = string.encode().decode('unicode_escape')
            string = re.sub(r'\\', '', string)
            data[parts[0]] = string

    # Part 2: visible texts from the rendered page. Collection starts at the
    # span containing "użytkowników odpowiedziało" and stops at the first
    # span containing one of the two closing phrases.
    text = []
    flag = 1  # 0 once the starting span has been seen
    for e in browser.find_elements(By.TAG_NAME, 'span'):
        span_text = e.text  # read once instead of on every comparison
        if "użytkowników odpowiedziało" in span_text:
            flag = 0
        elif "Wybierz się ze znajomymi" in span_text or "Jak wziąć udział" in span_text:
            break
        if flag == 0 and span_text != "" and re.search(r'[A-Za-z]', span_text):
            text.append(span_text)

    # Part 3: interpret the collected texts by position. Every index is
    # guarded so a page that yields fewer texts no longer raises IndexError.
    if text:
        if len(text) > 1 and "Wydarzenie" in text[1]:
            data['organizer'] = text[1].replace("Wydarzenie", "")

        if len(text) > 2:
            if 'online' in text[2] or 'Online' in text[2]:
                data['online'] = "true"
            else:
                data['online'] = "false"
                # NOTE(review): the original indentation was lost; the place
                # details are assumed to belong to this offline branch.
                data['place'] = [text[2]]
                for t in text:
                    if "ul." in t or re.match(r"\d+\, Poznań", t):
                        data['place'].append(t)

        if len(text) > 3:
            if 'za darmo' in text[3]:
                data['price'] = 'za darmo'
            elif 'Bilety' in text[3]:
                # The original tested the bare string 'Bilety', which is
                # always true.
                data['price'] = ['bilet']
                if len(text) > 4:
                    data['price'].append(text[4])

        data["category"] = [t for t in text if t in kategorii]

    # Debug output (disabled in the original script):
    # for key in data.keys():
    #     print(key + " : " + str(data[key]) + "\n")

    file = "event_" + str(number) + ".json"
    with open(file, "w") as outfile:
        json.dump(data, outfile)
    # The original `number =+ 1` assigned +1 every time, so every event after
    # the first was written to event_1.json.
    number += 1