139 lines
4.0 KiB
Python
139 lines
4.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
import json
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
|
|
|
|
# Open the Facebook login page in a new Firefox session.
url = 'https://www.facebook.com/login/'
browser = webdriver.Firefox()
browser.get(url)

# Dismiss the cookie-consent dialog, located by its Polish button title
# ("Akceptuj wszystkie" = "Accept all").
# find_elements() is used instead of find_element() because it returns an
# empty list rather than raising when the dialog is not shown, so the script
# keeps going in sessions where no consent prompt appears.  The generic
# find_element(s)(By..., ...) form is used because the find_element_by_*
# shortcuts no longer exist in Selenium 4.3+.
consent_buttons = browser.find_elements(
    By.XPATH, "//button[@title='Akceptuj wszystkie']"
)
if consent_buttons:
    accept = consent_buttons[0]
    accept.click()
|
|
|
|
# Sign in.
# The account credentials are read from the environment so that no secrets
# are stored in the source file.
fb_login = os.environ.get("FB_LOGIN")
fb_password = os.environ.get("FB_PASSWORD")
if not fb_login or not fb_password:
    # Do not leave an orphaned browser window behind when aborting.
    browser.quit()
    sys.exit("Set the FB_LOGIN and FB_PASSWORD environment variables first.")

# find_element(By.ID, ...) replaces find_element_by_id(), which was removed
# in Selenium 4.3; the generic form works on older releases as well.
login = browser.find_element(By.ID, "email")
password = browser.find_element(By.ID, "pass")
login_button = browser.find_element(By.ID, "loginbutton")

# Queue the whole form interaction, then replay it in one go with perform():
# focus the e-mail field, type the login, focus the password field, type the
# password, click the submit button.
sign_in = (
    ActionChains(browser)
    .click(on_element=login)
    .send_keys(fb_login)
    .click(on_element=password)
    .send_keys(fb_password)
    .click(on_element=login_button)
)
sign_in.perform()

# Wait until the new page is loaded (fixed delay, no explicit readiness check).
time.sleep(5)

# Go to the group page whose events are scraped below.
url = 'https://facebook.com/groups/babacoolpoznan/'
browser.get(url)
time.sleep(3)
|
|
|
|
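# scrolling (disabled): scrolls the window down in 500-pixel steps starting
# at offset 900 -- presumably to make the feed load more posts (TODO confirm).
# Indentation of the disabled snippet is restored from context.
# i = 0
# for _ in range(5):
#     browser.execute_script("window.scrollTo(0, " + str(900+i) + ");")
#     i += 500
#     time.sleep(3)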
|
|
|
|
# get_events
# Collect the event links that occur in the rendered group page.
html_page = browser.page_source

# Raw string: "\/" and "\?" are invalid escape sequences in a normal string
# literal.  The pattern lazily matches from "/events/" up to the next "?".
substring = r'\/events\/.*?\?'
pattern = re.compile(substring)
result = pattern.findall(html_page)

# NOTE(review): these two strided deletions thin the matches out purely by
# position -- presumably because every link is repeated a fixed number of
# times in the page source.  Confirm against a real page; the result is wrong
# as soon as the repetition count differs.
del result[::3]
del result[::2]

# Keep the text before the first space and drop its final character (the
# trailing "?" whenever the match contains no space).
links = [match.split(" ")[0][:-1] for match in result]
print(links)
|
|
|
|
# get data
# Base URL that the relative event links are appended to.
canonical_url = 'https://www.facebook.com'

# Polish event-category labels; a span whose text equals one of these is
# stored as a category of the event.  Three labels were misspelled
# ("Cele doroczynne", "Fitnrss", "Literature") and could never match.
kategorii = [
    "Cele dobroczynne",
    "Dom",
    "Film",
    "Fitness",
    "Gry",
    "Impreza",
    "Jedzenie",
    "Komedia",
    "Literatura",
    "Muzyka",
    "Napoje",
    "Ogrodnictwo",
    "Rękodzieło",
    "Sieci kontaktów",
    "Sport",
    "Sztuka",
    "Taniec",
    "Teatr",
    "Wellness",
    "Wydarzenia religijne",
    "Zakupy",
    "Zdrowie",
]

# Sequence number used in the names of the output files.
number = 0
|
|
# JSON-LD keys that are copied into the output record (matched as a prefix
# of each fragment).
_JSON_LD_FIELD = re.compile(r'^(name|startDate|description|endDate)')
# Address line made of a number followed by ", Poznań".
_POZNAN_ADDRESS = re.compile(r"\d+, Poznań")


def _clean_json_string(raw):
    """Return *raw* with escapes decoded, emoji removed and backslashes dropped."""
    # Turn backslash escapes (e.g. "\u0142") into the characters they denote.
    string = raw.encode().decode('unicode_escape')
    # Re-escape to pure ASCII so that surrogate halves become literal
    # "\udXXX" text -- the form in which escaped emoji show up ...
    string = string.encode('unicode_escape').decode('ascii')
    # ... and cut those sequences out (removes emoji).
    string = re.sub(r'\\ud(.){3}', '', string)
    # Decode the remaining escapes again, then drop any leftover backslashes.
    string = string.encode().decode('unicode_escape')
    return re.sub(r'\\', '', string)


def _parse_json_ld(markup):
    """Extract name, dates and description from the JSON-LD script markup.

    *markup* is the string form of the selected <script> tags.  It is split
    on ',"' and scanned fragment by fragment; fragments between the
    "location" object and the description are skipped.

    NOTE(review): this is a textual split, not a JSON parse, so the extracted
    values can keep trailing quote/brace characters.  Consider json.loads()
    on the script contents instead.
    """
    data = {}
    in_location = False
    for fragment in markup.split(',"'):
        fragment = fragment.replace('":"', ' : ')

        if 'location":{' in fragment:
            in_location = True
        elif 'description :' in fragment:
            in_location = False

        if in_location or not _JSON_LD_FIELD.search(fragment):
            continue
        parts = fragment.split(' : ')
        if len(parts) < 2:
            # Key without a plain string value (nothing to clean); skip it
            # instead of failing with IndexError.
            continue
        data[parts[0]] = _clean_json_string(parts[1])
    return data


def _collect_detail_spans(driver):
    """Return the non-empty <span> texts of the event details section.

    Collection starts at the span containing "użytkowników odpowiedziało"
    ("users responded") and stops at the first span containing
    "Wybierz się ze znajomymi" ("Go with friends") or "Jak wziąć udział"
    ("How to attend").  Texts without any ASCII letter are skipped.
    """
    text = []
    collecting = False
    # find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name(),
    # which was removed in Selenium 4.3.
    for element in driver.find_elements(By.TAG_NAME, 'span'):
        # Read the text once: every access is a round trip to the browser.
        label = element.text
        if "użytkowników odpowiedziało" in label:
            collecting = True
        elif "Wybierz się ze znajomymi" in label or "Jak wziąć udział" in label:
            break

        if collecting and label != "" and re.search(r'[A-Za-z]', label):
            text.append(label)
    return text


def _parse_event_details(text, categories):
    """Derive organizer, online flag, place, price and categories from *text*.

    *text* is the list of span texts of the details section; the fields are
    read from fixed positions in it.  Every position is checked before use,
    so a short list yields fewer keys instead of raising IndexError.  An
    empty list yields an empty dict.
    """
    details = {}
    if not text:
        return details

    if len(text) > 1 and "Wydarzenie" in text[1]:
        details['organizer'] = text[1].replace("Wydarzenie", "")

    if len(text) > 2:
        if 'online' in text[2] or 'Online' in text[2]:
            details['online'] = "true"
        else:
            details['online'] = "false"
            place = [text[2]]
            for t in text:
                # Street lines ("ul.") and "<number>, Poznań" lines belong to
                # the address.
                if "ul." in t or _POZNAN_ADDRESS.match(t):
                    place.append(t)
            details['place'] = place

    if len(text) > 3:
        if 'za darmo' in text[3]:
            details['price'] = 'za darmo'
        elif 'Bilety' in text[3]:
            # The original condition was the bare string 'Bilety', which is
            # always true; test the same span as the free-of-charge check.
            price = ['bilet']
            if len(text) > 4:
                price.append(text[4])
            details['price'] = price

    details["category"] = [t for t in text if t in categories]
    return details


# Visit every event page and write its details to event_<number>.json.
for link in links:
    url = canonical_url + link
    browser.get(url)
    time.sleep(3)  # fixed delay to let the event page render

    # Structured data comes from a separate plain HTTP request for the same
    # URL; the timeout keeps a stalled connection from hanging the script.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.content, 'html.parser')
    script = soup.select("script[type='application/ld+json']")
    data = _parse_json_ld(str(script))

    # The remaining fields are read from the page rendered in the browser.
    text = _collect_detail_spans(browser)
    data.update(_parse_event_details(text, kategorii))

    file = "event_" + str(number) + ".json"
    with open(file, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile)
    # Was "number =+ 1", which assigns +1 every time so all events after the
    # first were written to event_1.json.
    number += 1
|