#!/usr/bin/env python
from __future__ import annotations
import json
import logging
import re
import warnings
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import parse_qs, quote_plus, urlsplit
from urllib.request import urlopen
import requests
import urllib3
from bs4 import BeautifulSoup
from cdp_backend.pipeline.ingestion_models import Person, Seat
from ..legistar_utils import (
LEGISTAR_EV_SITE_URL,
LEGISTAR_SESSION_DATE,
LegistarScraper,
)
from ..scraper_utils import parse_static_file, str_simplified
from ..types import ContentURIs
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
STATIC_FILE_KEY_PERSONS = "persons"
STATIC_FILE_DEFAULT_PATH = Path(__file__).parent / "seattle-static.json"
# we have discovered the city clerk accidentally entered Daniel Strauss
# instead of the correct Dan Strauss for a few events
PERSON_ALIASES = {"Dan Strauss": {"Daniel Strauss"}}
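# a minimal sketch of how such an alias map can be applied (hypothetical helper;
# actual resolution presumably happens inside LegistarScraper, which receives
# this map via the person_aliases kwarg below):
#
#     def resolve_alias(name: str) -> str:
#         for canonical, aliases in PERSON_ALIASES.items():
#             if name in aliases:
#                 return canonical
#         return name
#
#     resolve_alias("Daniel Strauss")  # -> "Dan Strauss"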
###############################################################################
class VideoIdMismatchError(ValueError):
    """Raised when the date on the video web page does not match the event date."""
class SeattleScraper(LegistarScraper):
PYTHON_MUNICIPALITY_SLUG: str = "seattle"
def __init__(self):
"""Seattle specific implementation of LegistarScraper."""
super().__init__(
client="seattle",
timezone="America/Los_Angeles",
ignore_minutes_item_patterns=[
"This meeting also constitutes a meeting of the City Council",
"In-person attendance is currently prohibited",
"Times listed are estimated",
"has been cancelled",
"Deputy City Clerk",
"Executive Sessions are closed to the public",
# Sometimes will have number after "Session", e.g. "Session I"
r"Executive Session \S*\s*on Pending, Potential, or Actual Litigation",
"Items of Business",
# Common to see "CITY COUNCIL:",
# Or more generally "{body name}:"
# Check for last char ":"
r".+:$",
"Pursuant to Washington State",
],
static_data=parse_static_file(
STATIC_FILE_DEFAULT_PATH, "America/Los_Angeles"
),
person_aliases=PERSON_ALIASES,
)
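        # illustrative (an assumption about LegistarScraper internals): each
        # pattern above is presumably applied with re.search() against minutes
        # item text, e.g. re.search(r".+:$", "CITY COUNCIL:") matches, so that
        # minutes item would be ignored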
# TODO: larger fix later
#
# try:
# urlopen("https://seattlechannel.org/")
# except URLError:
# pass
# else:
# raise Exception(
# "seattlechannel.org may have fixed their SSL cert. "
# "Check and fix 'requests.get(*, verify=False)' calls"
# )
def parse_content_uris(
self, video_page_url: str, event_short_date: str
) -> list[ContentURIs]:
"""
Return URLs for videos and captions parsed from seattlechannel.org web page.
Parameters
----------
video_page_url: str
URL to a web page for a particular meeting video
event_short_date: str
            The meeting's date as a string in m/d/yy format, used for verification
Returns
-------
content_uris: List[ContentURIs]
List of ContentURIs objects for each session found.
Raises
------
VideoIdMismatchError
If date on the video web page does not match the event date.
See Also
--------
get_content_uris
"""
with warnings.catch_warnings():
warnings.simplefilter(
"ignore",
category=urllib3.exceptions.InsecureRequestWarning,
)
# now load the page to get the actual video url
soup = BeautifulSoup(
requests.get(video_page_url, verify=False).text,
"html.parser",
)
# <script>
# ...
# playerInstance.setup({
# sources: [
# {
# file: "//...mp4",
# label: "Auto"
# }
# ],
# ...
# tracks: [{
# file: "documents/seattlechannel/closedcaption/2021/...vtt",
# label: "English",
# kind: "captions",
# "default": true
# }
#
# ],
# ...
# entire script tag text that has the video player setup call
video_script_block = soup.find(
"script", text=re.compile(r"playerInstance\.setup")
)
if not video_script_block:
log.warning(
f"Couldn't find 'playerInstance.setup()' block on {video_page_url}.\n"
"seattlechannel.org may have changed their video page html"
)
return []
video_script_text = video_script_block.string
# halt if event date not in video's idstring
# likely means some change on video web page source / script
# e.g. idstring:'Select Budget Committee Session II 10/14/21'
# idstring:'City Council 10/11/21'
if not re.search(f"idstring:.+{event_short_date}.+", video_script_text):
video_id_error = VideoIdMismatchError(
f"event date {event_short_date} not in video idstring.\n"
f"{video_page_url} may be for a different event's video.\n"
)
log.warning(str(video_id_error))
raise video_id_error
# playerSetup({...
# ^
player_arg_start = re.search(
r"playerInstance\.setup\((\{)", video_script_text
).start(1)
# ...});
# ^
# playerInstance... # more playerInstance code
video_json_blob = video_script_text[
player_arg_start : player_arg_start
+ re.search(
r"\)\;\s*\n\s*playerInstance", video_script_text[player_arg_start:]
).start(0)
]
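        # e.g. (illustrative, whitespace condensed) for script text
        #   "playerInstance.setup({sources: [...], tracks: [...]});\n playerInstance.on(..."
        # video_json_blob would be "{sources: [...], tracks: [...]}"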
# not smart enough to make one-line regex for all the 'file's in 'sources'
videos_start = video_json_blob.find("sources:")
videos_end = video_json_blob.find("],", videos_start)
# as shown above, url will start with // so prepend https:
video_uris = [
"https:" + i
for i in re.findall(
r"file\:\s*\"([^\"]+)",
video_json_blob[videos_start:videos_end],
)
]
captions_start = video_json_blob.find("tracks:")
captions_end = video_json_blob.find("],", captions_start)
caption_uris = [
"https://www.seattlechannel.org/" + i
for i in re.findall(
r"file\:\s*\"([^\"]+)",
video_json_blob[captions_start:captions_end],
)
]
        # use max count between videos and captions
        # so we don't lose any (e.g. caption_uri = None when there are fewer
        # captions than videos)
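        # e.g. (illustrative) 2 video uris and 1 caption uri pair up below as
        #   [ContentURIs(video_uri=v0, caption_uri=c0),
        #    ContentURIs(video_uri=v1, caption_uri=None)]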
list_uri = []
for i in range(max(len(video_uris), len(caption_uris))):
# just in case # videos != # captions
try:
video_uri = video_uris[i]
except IndexError:
video_uri = None
try:
caption_uri = caption_uris[i]
except IndexError:
caption_uri = None
list_uri.append(ContentURIs(video_uri=video_uri, caption_uri=caption_uri))
if len(list_uri) == 0:
log.debug(f"No video URI found on {video_page_url}")
return list_uri
@staticmethod
    def roman_to_int(roman: str) -> int:
        """
        Convert a Roman numeral to an integer.
Parameters
----------
roman: str
Roman numeral string
Returns
-------
int
Input roman numeral as integer
References
----------
https://www.w3resource.com/python-exercises/class-exercises/python-class-exercise-2.php
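
        Examples
        --------
        The subtractive rule (e.g. "IV") is handled in the loop below:

        >>> SeattleScraper.roman_to_int("IV")
        4
        >>> SeattleScraper.roman_to_int("XIV")
        14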
"""
rom_val = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
int_val = 0
for i in range(len(roman)):
if i > 0 and rom_val[roman[i]] > rom_val[roman[i - 1]]:
# subtract twice the i-1 th number since it has already been added
int_val += rom_val[roman[i]] - 2 * rom_val[roman[i - 1]]
else:
int_val += rom_val[roman[i]]
return int_val
def get_video_page_urls(
self, video_list_page_url: str, event_short_date: str
) -> list[str]:
"""
Return URLs to web pages hosting videos for meetings from event_short_date.
Parameters
----------
video_list_page_url: str
URL to web page listing videos featuring the responsible group/body
for the event described in legistar_ev.
e.g. http://www.seattlechannel.org/BudgetCommittee?Mode2=Video
event_short_date: str
            The meeting's date as a string in m/d/yy format
Returns
-------
video_page_urls: List[str]
web page URL per video
See Also
--------
get_content_uris
"""
with warnings.catch_warnings():
warnings.simplefilter(
"ignore",
category=urllib3.exceptions.InsecureRequestWarning,
)
# request list of videos for this group on this event's date
response = requests.get(
# this is the query sent by the "filter" button on the web page
f"{video_list_page_url}&filterTerm={quote_plus(event_short_date)}"
"&itemsPerPage=25&toggleDisplay=Thumbnail_Excerpt",
verify=False,
).text
# <div class="paginationContainer">
# <div class="row borderBottomNone paginationItem">
# <div class="col-xs-12 col-sm-4 col-md-3">
# <a href='/BudgetCommittee?videoid=x132213'... </a>
# </div>
# <div class="col-xs-12 col-sm-8 col-md-9">
# <div class="titleDateContainer">
# <h2 class="paginationTitle">
# <a href="/BudgetCommittee?videoid=x132213" ... </a>
# </h2>
# <div class="videoDate">10/14/2021</div>
# </div>
# <div class="titleExcerptText"><p><em>Pursuant to Washington ... </div>
# </div>
# </div>
# <div class="row borderBottomNone paginationItem">
session_video_page_urls: dict[int, str] = {}
# want <a> tag in the <div> with
# title attribute that contains the event date,
# onclick attribute that calls loadJWPlayer,
# href attribute that contains videoid
soup = BeautifulSoup(response, "html.parser")
        pagination = soup.find("div", class_="paginationContainer")
        if pagination is None:
            log.warning(f"Couldn't find 'paginationContainer' on {video_list_page_url}")
            return []
        for link in pagination.find_all(
            "a",
            href=re.compile("videoid"),
            onclick=re.compile("loadJWPlayer"),
            title=re.compile(event_short_date),
        ):
# e.g. "Session I m/d/yy"
match = re.search(
r"session\s(?P<session_int>\d*)(?P<session_roman>[IVXLCDM]*)",
link["title"],
re.IGNORECASE,
)
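            # illustrative titles and the resulting session keys:
            #   "Select Budget Committee Session II 10/14/21" -> roman "II" -> 2
            #   "Select Budget Committee Session 2 10/14/21"  -> int "2"    -> 2
            #   title without "Session ..." -> no match -> positional key below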
if match:
if match.group("session_int"):
session_video_page_urls[
int(match.group("session_int"))
] = f"https://www.seattlechannel.org{link['href']}"
                elif match.group("session_roman"):
                    session_video_page_urls[
                        SeattleScraper.roman_to_int(match.group("session_roman"))
                    ] = f"https://www.seattlechannel.org{link['href']}"
else:
session_video_page_urls[
len(session_video_page_urls)
] = f"https://www.seattlechannel.org{link['href']}"
# ordered by session number
return [
session_video_page_urls[session]
for session in sorted(session_video_page_urls.keys())
]
def get_content_uris(self, legistar_ev: dict) -> list[ContentURIs]: # noqa: C901
"""
Return URLs for videos and captions parsed from seattlechannel.org web page.
Parameters
----------
legistar_ev: Dict
Data for one Legistar Event.
Returns
-------
content_uris: List[ContentURIs]
List of ContentURIs objects for each session found.
See Also
--------
parse_content_uris
Notes
-----
get_events() calls get_content_uris() to get video and caption URIs.
get_content_uris() gets video page URL from EventInSiteURL.
If "videoid" in video page URL, calls parse_content_uris().
Else, calls get_video_page_urls() to get proper video page URL with "videoid",
then calls parse_content_uris().
get_events()
-> get_content_uris()
-> parse_content_uris()
or
-> get_video_page_urls(), parse_content_uris()
"""
try:
# a td tag with a certain id pattern containing url to video
with urlopen(legistar_ev[LEGISTAR_EV_SITE_URL]) as resp:
soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
log.debug(f"Failed to open {legistar_ev[LEGISTAR_EV_SITE_URL]}")
return []
try:
# this gets us the url for the web PAGE containing the video
video_page_url = soup.find(
"a",
id=re.compile(r"ct\S*_ContentPlaceHolder\S*_hypVideo"),
class_="videolink",
)["href"]
log.debug(f"{legistar_ev[LEGISTAR_EV_SITE_URL]} -> {video_page_url}")
# catch if find() didn't find video web page url (no <a id=... href=.../>)
except KeyError:
log.debug(f"No URL for video page on {legistar_ev[LEGISTAR_EV_SITE_URL]}")
return []
def get_uris_for_date(event_date: datetime, year_str: str) -> list[ContentURIs]:
# want no leading zero for month or day
            event_short_date = f"{event_date.month}/{event_date.day}/{year_str}"
try:
# Some meetings will have text like "Session II" in "Meeting location".
# For those, don't bother verifying video page URL.
# They are multi-session and we need to call get_video_page_urls()
if (
"session ii"
not in soup.find(
"span",
id=re.compile(r"ctl\S*_ContentPlaceHolder\S*_lblLocation$"),
).text.lower()
):
try:
if parse_qs(urlsplit(video_page_url).query)["videoid"]:
# video link contains specific videoid
return self.parse_content_uris(
video_page_url, event_short_date
)
except KeyError:
pass
# at this point video_page_url points to generic video list page like
# http://www.seattlechannel.org/BudgetCommittee?Mode2=Video
return [
uris
# 1 web page per session video for this multi-session event
for page_url in self.get_video_page_urls(
video_page_url, event_short_date
)
# video and caption urls on the session video web page
for uris in self.parse_content_uris(page_url, event_short_date)
]
except VideoIdMismatchError:
return []
event_date = datetime.fromisoformat(legistar_ev[LEGISTAR_SESSION_DATE])
        # try a 2-digit year without century first, then a 4-digit year with
        # century, to account for old and new date strings used on Seattle Channel
        uris = get_uris_for_date(event_date, event_date.strftime("%y"))
        if not any(uris):
            uris = get_uris_for_date(event_date, event_date.strftime("%Y"))
        return uris
@staticmethod
def get_person_picture_url(person_www: str) -> str | None:
"""
Parse person_www and return banner image used on the web page.
Parameters
----------
person_www: str
e.g. http://www.seattle.gov/council/pedersen
Returns
-------
Image URL: Optional[str]
Full URL to banner image displayed on person_www
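
        Examples
        --------
        Network-dependent, so skipped under doctest; the expected value uses
        the banner path shown in the html snippet quoted in this method:

        >>> SeattleScraper.get_person_picture_url(  # doctest: +SKIP
        ...     "http://www.seattle.gov/council/pedersen"
        ... )
        'http://www.seattle.gov/assets/images/Council/Members/Pedersen/Councilmember-Alex-Pedersen_homepage-banner.jpg'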
"""
try:
with urlopen(person_www) as resp:
soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
            log.debug(f"Failed to open {person_www}")
return None
# <div class="featureWrapperShort" style="background-image:
# url('/assets/images/Council/Members/Pedersen/
# Councilmember-Alex-Pedersen_homepage-banner.jpg')"></div>
div = soup.find(
"div", class_="featureWrapperShort", style=re.compile(r"background\-image")
)
if not div:
return None
try:
            # now get just the image uri '/assets/...'; it already starts with
            # "/" so don't append another slash to the host
            return "http://www.seattle.gov" + re.search(
r"url\('([^']+)", div["style"]
).group(1)
except AttributeError:
pass
return None
@staticmethod
def get_static_person_info() -> list[Person] | None: # noqa: C901
"""
Return partial Persons with static long-term information.
Returns
-------
persons: Optional[List[Person]]
"""
try:
# has table with all council members
with urlopen("https://seattle.legistar.com/MainBody.aspx") as resp:
soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
log.debug("Failed to open https://seattle.legistar.com/MainBody.aspx")
return None
static_person_info: list[Person] = []
# <tr id="ctl00_ContentPlaceHolder1_gridPeople_ctl00__0" ...>
# <td class="rgSorted" style="white-space:nowrap;">
# <a ...>Alex Pedersen</a>
# </td>
# <td>Councilmember<br /><em>Council Position No. 4</em></td>
# <td>1/1/2020</td>
# <td style="white-space:nowrap;">
# <span ...>12/31/2023</span>
# </td>
# <td style="white-space:nowrap;">
# <a ...>Alex.Pedersen@seattle.gov</a>
# </td>
# <td style="white-space:nowrap;">
# <a ...>http://www.seat...ouncil/pedersen</a>
# </td>
# </tr>
for tr in soup.find_all(
"tr",
# each row with this id in said table is for a council member
id=re.compile(r"ctl\d+_ContentPlaceHolder\d+_gridPeople_ctl\d+__\d+"),
):
# <a> tag in this row with this id has full name
try:
name = str_simplified(
tr.find(
"a",
id=re.compile(
r"ctl\d*_ContentPlaceHolder\d*"
r"_gridPeople_ctl\d*_ctl\d*_hypPerson"
),
).text
)
except AttributeError:
# find() returned None
continue
# <a> tag in this row with this id has url
# for web page with more info on this person
try:
person_picture_url = SeattleScraper.get_person_picture_url(
tr.find(
"a",
id=re.compile(
r"ctl\d*_ContentPlaceHolder\d*"
r"_gridPeople_ctl\d*_ctl\d*_hypWebSite"
),
)["href"]
)
except AttributeError:
# find() returned None
continue
# <td> in this row with <br> and <em> has seat name
# <td>Councilmember<br /><em>Council Position No. 4</em></td>
# the seat is the <em>-phasized text
try:
seat = Seat(
name=str_simplified(
[
td
for td in tr.find_all("td")
if td.find("br") is not None and td.find("em") is not None
][0].em.text
)
)
except IndexError:
# accessed 0-th item in an empty list []
continue
# from "Council Position No. 4"
# Seat.electoral_area: District 4
# Seat.name: Position 4
# from "At-large Council Position No. 9"
# Seat.electoral_area: At-large
# Seat.name: Position 9
match = re.search(
r"(?P<atlarge>At.*large)?.*position.*(?P<position_num>\d+)",
seat.name,
re.IGNORECASE,
)
if match:
seat_number = match.group("position_num")
seat.electoral_area = f"District {seat_number}"
if match.group("atlarge"):
seat.electoral_area = "Citywide"
seat.name = f"Position {seat_number}"
static_person_info.append(
Person(name=name, picture_uri=person_picture_url, seat=seat)
)
return static_person_info
@staticmethod
def dump_static_info(file_path: str) -> bool:
"""
Save static data in json format.
Parameters
----------
file_path: str
Static data dump file path
Returns
-------
bool
True if some data was saved in file_path
See Also
--------
LegistarScraper.inject_known_data
"""
static_person_info = {}
        # get_static_person_info() returns None on failure; guard with an empty list
        for person in SeattleScraper.get_static_person_info() or []:
# save this Person in json keyed by the name
static_person_info[person.name] = json.loads(person.to_json())
if not static_person_info:
return False
with open(file_path, "w") as dump:
dump.write(
json.dumps({STATIC_FILE_KEY_PERSONS: static_person_info}, indent=4)
)
return True
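

# a minimal usage sketch (kept as a comment so importing this module stays
# side-effect free; assumes network access to seattle.legistar.com and
# seattle.gov; the output file name is illustrative):
#
#     if __name__ == "__main__":
#         SeattleScraper.dump_static_info("seattle-static.json")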