Source code for cdp_scrapers.instances.seattle

#!/usr/bin/env python

from __future__ import annotations

import json
import logging
import re
import warnings
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import parse_qs, quote_plus, urlsplit
from urllib.request import urlopen

import requests
import urllib3
from bs4 import BeautifulSoup
from cdp_backend.pipeline.ingestion_models import Person, Seat

from ..legistar_utils import (
    LEGISTAR_EV_SITE_URL,
    LEGISTAR_SESSION_DATE,
    LegistarScraper,
)
from ..scraper_utils import parse_static_file, str_simplified
from ..types import ContentURIs

###############################################################################

log = logging.getLogger(__name__)

###############################################################################

STATIC_FILE_KEY_PERSONS = "persons"
STATIC_FILE_DEFAULT_PATH = Path(__file__).parent / "seattle-static.json"

# we have discovered the city clerk accidentally entered Daniel Strauss
# instead of the correct Dan Strauss for a few events
PERSON_ALIASES = {"Dan Strauss": {"Daniel Strauss"}}

###############################################################################


class VideoIdMismatchError(ValueError):
    """Raised when the date on the video web page does not match the event date."""


class SeattleScraper(LegistarScraper):
    PYTHON_MUNICIPALITY_SLUG: str = "seattle"

    def __init__(self):
        """Seattle specific implementation of LegistarScraper."""
        super().__init__(
            client="seattle",
            timezone="America/Los_Angeles",
            ignore_minutes_item_patterns=[
                "This meeting also constitutes a meeting of the City Council",
                "In-person attendance is currently prohibited",
                "Times listed are estimated",
                "has been cancelled",
                "Deputy City Clerk",
                "Executive Sessions are closed to the public",
                # Sometimes will have a number after "Session", e.g. "Session I"
                r"Executive Session \S*\s*on Pending, Potential, or Actual Litigation",
                "Items of Business",
                # Common to see "CITY COUNCIL:",
                # or more generally "{body name}:"
                # Check for last char ":"
                r".+:$",
                "Pursuant to Washington State",
            ],
            static_data=parse_static_file(
                STATIC_FILE_DEFAULT_PATH, "America/Los_Angeles"
            ),
            person_aliases=PERSON_ALIASES,
        )

        # TODO: larger fix later
        #
        # try:
        #     urlopen("https://seattlechannel.org/")
        # except URLError:
        #     pass
        # else:
        #     raise Exception(
        #         "seattlechannel.org may have fixed their SSL cert. "
        #         "Check and fix 'requests.get(*, verify=False)' calls"
        #     )

    def parse_content_uris(
        self, video_page_url: str, event_short_date: str
    ) -> list[ContentURIs]:
        """
        Return URLs for videos and captions parsed from a seattlechannel.org web page.

        Parameters
        ----------
        video_page_url: str
            URL to a web page for a particular meeting video
        event_short_date: str
            The meeting's date as a short string in m/d/yy form, used for verification

        Returns
        -------
        content_uris: List[ContentURIs]
            List of ContentURIs objects for each session found.

        Raises
        ------
        VideoIdMismatchError
            If the date on the video web page does not match the event date.

        See Also
        --------
        get_content_uris
        """
        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore",
                category=urllib3.exceptions.InsecureRequestWarning,
            )
            # now load the page to get the actual video url
            soup = BeautifulSoup(
                requests.get(video_page_url, verify=False).text,
                "html.parser",
            )

        # <script>
        # ...
        # playerInstance.setup({
        #     sources: [
        #         {
        #             file: "//...mp4",
        #             label: "Auto"
        #         }
        #     ],
        #     ...
        #     tracks: [{
        #         file: "documents/seattlechannel/closedcaption/2021/...vtt",
        #         label: "English",
        #         kind: "captions",
        #         "default": true
        #     }
        #     ],
        #     ...

        # entire script tag text that has the video player setup call
        video_script_block = soup.find(
            "script", text=re.compile(r"playerInstance\.setup")
        )
        if not video_script_block:
            log.warning(
                f"Couldn't find 'playerInstance.setup()' block on {video_page_url}.\n"
                "seattlechannel.org may have changed their video page html"
            )
            return []
        video_script_text = video_script_block.string

        # halt if the event date is not in the video's idstring;
        # likely means some change on the video web page source / script
        # e.g. idstring:'Select Budget Committee Session II 10/14/21'
        #      idstring:'City Council 10/11/21'
        if not re.search(f"idstring:.+{event_short_date}.+", video_script_text):
            video_id_error = VideoIdMismatchError(
                f"event date {event_short_date} not in video idstring.\n"
                f"{video_page_url} may be for a different event's video.\n"
            )
            log.warning(str(video_id_error))
            raise video_id_error

        # playerSetup({...
        #             ^
        player_arg_start = re.search(
            r"playerInstance\.setup\((\{)", video_script_text
        ).start(1)

        # ...});
        #    ^
        # playerInstance...
        # more playerInstance code
        video_json_blob = video_script_text[
            player_arg_start : player_arg_start
            + re.search(
                r"\)\;\s*\n\s*playerInstance", video_script_text[player_arg_start:]
            ).start(0)
        ]

        # not smart enough to make a one-line regex for all the 'file's in 'sources'
        videos_start = video_json_blob.find("sources:")
        videos_end = video_json_blob.find("],", videos_start)
        # as shown above, the url will start with // so prepend https:
        video_uris = [
            "https:" + i
            for i in re.findall(
                r"file\:\s*\"([^\"]+)",
                video_json_blob[videos_start:videos_end],
            )
        ]

        captions_start = video_json_blob.find("tracks:")
        captions_end = video_json_blob.find("],", captions_start)
        caption_uris = [
            "https://www.seattlechannel.org/" + i
            for i in re.findall(
                r"file\:\s*\"([^\"]+)",
                video_json_blob[captions_start:captions_end],
            )
        ]

        # use the max count between videos and captions
        # so we don't lose any (e.g. caption = None if < # videos)
        list_uri = []

        for i in range(max(len(video_uris), len(caption_uris))):
            # just in case # videos != # captions
            try:
                video_uri = video_uris[i]
            except IndexError:
                video_uri = None

            try:
                caption_uri = caption_uris[i]
            except IndexError:
                caption_uri = None

            list_uri.append(ContentURIs(video_uri=video_uri, caption_uri=caption_uri))

        if len(list_uri) == 0:
            log.debug(f"No video URI found on {video_page_url}")
        return list_uri
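
    # Illustrative sketch of the shape parse_content_uris() returns for a
    # single-session event; the page URL and media URIs below are hypothetical
    # placeholders, not real recordings:
    #
    #   scraper = SeattleScraper()
    #   scraper.parse_content_uris(
    #       "https://www.seattlechannel.org/FullCouncil?videoid=x000000", "10/11/21"
    #   )
    #   # -> [ContentURIs(
    #   #        video_uri="https://video.example.org/council_101121.mp4",
    #   #        caption_uri="https://www.seattlechannel.org/documents/.../council_101121.vtt",
    #   #    )]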

    @staticmethod
    def roman_to_int(roman: str) -> int:
        """
        Convert a Roman numeral to an integer.

        Parameters
        ----------
        roman: str
            Roman numeral string

        Returns
        -------
        int
            Input roman numeral as an integer

        References
        ----------
        https://www.w3resource.com/python-exercises/class-exercises/python-class-exercise-2.php
        """
        rom_val = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
        int_val = 0
        for i in range(len(roman)):
            if i > 0 and rom_val[roman[i]] > rom_val[roman[i - 1]]:
                # subtract twice the (i-1)-th value since it has already been added
                int_val += rom_val[roman[i]] - 2 * rom_val[roman[i - 1]]
            else:
                int_val += rom_val[roman[i]]
        return int_val
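
    # For reference, roman_to_int() handles the subtractive forms that show up in
    # Seattle Channel session titles:
    #   SeattleScraper.roman_to_int("II")  -> 2
    #   SeattleScraper.roman_to_int("IV")  -> 4
    #   SeattleScraper.roman_to_int("IX")  -> 9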

    def get_video_page_urls(
        self, video_list_page_url: str, event_short_date: str
    ) -> list[str]:
        """
        Return URLs to web pages hosting videos for meetings from event_short_date.

        Parameters
        ----------
        video_list_page_url: str
            URL to the web page listing videos featuring the responsible group/body
            for the event described in legistar_ev,
            e.g. http://www.seattlechannel.org/BudgetCommittee?Mode2=Video
        event_short_date: str
            The meeting's date as a short string in m/d/yy form

        Returns
        -------
        video_page_urls: List[str]
            web page URL per video

        See Also
        --------
        get_content_uris
        """
        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore",
                category=urllib3.exceptions.InsecureRequestWarning,
            )
            # request list of videos for this group on this event's date
            response = requests.get(
                # this is the query sent by the "filter" button on the web page
                f"{video_list_page_url}&filterTerm={quote_plus(event_short_date)}"
                "&itemsPerPage=25&toggleDisplay=Thumbnail_Excerpt",
                verify=False,
            ).text

        # <div class="paginationContainer">
        #     <div class="row borderBottomNone paginationItem">
        #         <div class="col-xs-12 col-sm-4 col-md-3">
        #             <a href='/BudgetCommittee?videoid=x132213'... </a>
        #         </div>
        #         <div class="col-xs-12 col-sm-8 col-md-9">
        #             <div class="titleDateContainer">
        #                 <h2 class="paginationTitle">
        #                     <a href="/BudgetCommittee?videoid=x132213" ... </a>
        #                 </h2>
        #                 <div class="videoDate">10/14/2021</div>
        #             </div>
        #             <div class="titleExcerptText"><p><em>Pursuant to Washington ... </div>
        #         </div>
        #     </div>
        #     <div class="row borderBottomNone paginationItem">

        session_video_page_urls: dict[int, str] = {}

        # want the <a> tag in the <div> with
        # title attribute that contains the event date,
        # onclick attribute that calls loadJWPlayer,
        # href attribute that contains videoid
        soup = BeautifulSoup(response, "html.parser")
        for link in soup.find(
            "div",
            class_="paginationContainer",
        ).find_all(
            "a",
            href=re.compile("videoid"),
            onclick=re.compile("loadJWPlayer"),
            title=re.compile(event_short_date),
        ):
            # e.g. "Session I m/d/yy"
            match = re.search(
                r"session\s(?P<session_int>\d*)(?P<session_roman>[IVXLCDM]*)",
                link["title"],
                re.IGNORECASE,
            )
            if match:
                if match.group("session_int"):
                    session_video_page_urls[
                        int(match.group("session_int"))
                    ] = f"https://www.seattlechannel.org{link['href']}"
                elif match.group("session_roman"):
                    session_video_page_urls[
                        int(SeattleScraper.roman_to_int(match.group("session_roman")))
                    ] = f"https://www.seattlechannel.org{link['href']}"
            else:
                session_video_page_urls[
                    len(session_video_page_urls)
                ] = f"https://www.seattlechannel.org{link['href']}"

        # ordered by session number
        return [
            session_video_page_urls[session]
            for session in sorted(session_video_page_urls.keys())
        ]
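
    # Illustrative sketch (hypothetical titles): given video links titled
    # "Select Budget Committee Session II 10/14/21" and
    # "Select Budget Committee Session I 10/14/21", get_video_page_urls() keys the
    # pages by session number and returns the Session I page URL first.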

    def get_content_uris(self, legistar_ev: dict) -> list[ContentURIs]:  # noqa: C901
        """
        Return URLs for videos and captions parsed from a seattlechannel.org web page.

        Parameters
        ----------
        legistar_ev: Dict
            Data for one Legistar Event.

        Returns
        -------
        content_uris: List[ContentURIs]
            List of ContentURIs objects for each session found.

        See Also
        --------
        parse_content_uris

        Notes
        -----
        get_events() calls get_content_uris() to get video and caption URIs.
        get_content_uris() gets the video page URL from EventInSiteURL.
        If "videoid" is in the video page URL, it calls parse_content_uris().
        Else, it calls get_video_page_urls() to get the proper video page URL with
        "videoid", then calls parse_content_uris().

        get_events()
            -> get_content_uris()
                -> parse_content_uris()
                or
                -> get_video_page_urls(), parse_content_uris()
        """
        try:
            # a td tag with a certain id pattern containing url to video
            with urlopen(legistar_ev[LEGISTAR_EV_SITE_URL]) as resp:
                soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
            log.debug(f"Failed to open {legistar_ev[LEGISTAR_EV_SITE_URL]}")
            return []

        try:
            # this gets us the url for the web PAGE containing the video
            video_page_url = soup.find(
                "a",
                id=re.compile(r"ct\S*_ContentPlaceHolder\S*_hypVideo"),
                class_="videolink",
            )["href"]
            log.debug(f"{legistar_ev[LEGISTAR_EV_SITE_URL]} -> {video_page_url}")
        # catch if find() didn't find the video web page url (no <a id=... href=.../>)
        except (KeyError, TypeError):
            log.debug(f"No URL for video page on {legistar_ev[LEGISTAR_EV_SITE_URL]}")
            return []

        def get_uris_for_date(event_date: datetime, year_str: str) -> list[ContentURIs]:
            # want no leading zero for month or day
            event_short_date = f"{event_date.month}/{event_date.day}/{year_str}"

            try:
                # Some meetings will have text like "Session II" in "Meeting location".
                # For those, don't bother verifying the video page URL.
                # They are multi-session and we need to call get_video_page_urls()
                if (
                    "session ii"
                    not in soup.find(
                        "span",
                        id=re.compile(r"ctl\S*_ContentPlaceHolder\S*_lblLocation$"),
                    ).text.lower()
                ):
                    try:
                        if parse_qs(urlsplit(video_page_url).query)["videoid"]:
                            # video link contains a specific videoid
                            return self.parse_content_uris(
                                video_page_url, event_short_date
                            )
                    except KeyError:
                        pass

                # at this point video_page_url points to a generic video list page like
                # http://www.seattlechannel.org/BudgetCommittee?Mode2=Video
                return [
                    uris
                    # 1 web page per session video for this multi-session event
                    for page_url in self.get_video_page_urls(
                        video_page_url, event_short_date
                    )
                    # video and caption urls on the session video web page
                    for uris in self.parse_content_uris(page_url, event_short_date)
                ]
            except VideoIdMismatchError:
                return []

        event_date = datetime.fromisoformat(legistar_ev[LEGISTAR_SESSION_DATE])

        # Try first using the 2-digit year without century, then the 4-digit year
        # with century, to account for old and new date strings used on Seattle Channel
        uris = get_uris_for_date(event_date, event_date.strftime("%y"))
        uris = (
            uris
            if any(uris)
            else get_uris_for_date(event_date, event_date.strftime("%Y"))
        )
        return uris

    @staticmethod
    def get_person_picture_url(person_www: str) -> str | None:
        """
        Parse person_www and return the banner image used on the web page.

        Parameters
        ----------
        person_www: str
            e.g. http://www.seattle.gov/council/pedersen

        Returns
        -------
        Image URL: Optional[str]
            Full URL to the banner image displayed on person_www
        """
        try:
            with urlopen(person_www) as resp:
                soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
            log.debug(f"Failed to open {person_www}")
            return None

        # <div class="featureWrapperShort" style="background-image:
        #     url('/assets/images/Council/Members/Pedersen/
        #     Councilmember-Alex-Pedersen_homepage-banner.jpg')"></div>
        div = soup.find(
            "div", class_="featureWrapperShort", style=re.compile(r"background\-image")
        )
        if not div:
            return None

        try:
            # now get just the image uri '/assets/...'
            return "http://www.seattle.gov/" + re.search(
                r"url\('([^']+)", div["style"]
            ).group(1)
        except AttributeError:
            pass

        return None

    @staticmethod
    def get_static_person_info() -> list[Person] | None:  # noqa: C901
        """
        Return partial Persons with static long-term information.

        Returns
        -------
        persons: Optional[List[Person]]
        """
        try:
            # has a table with all council members
            with urlopen("https://seattle.legistar.com/MainBody.aspx") as resp:
                soup = BeautifulSoup(resp.read(), "html.parser")
        except (URLError, HTTPError):
            log.debug("Failed to open https://seattle.legistar.com/MainBody.aspx")
            return None

        static_person_info: list[Person] = []

        # <tr id="ctl00_ContentPlaceHolder1_gridPeople_ctl00__0" ...>
        #     <td class="rgSorted" style="white-space:nowrap;">
        #         <a ...>Alex Pedersen</a>
        #     </td>
        #     <td>Councilmember<br /><em>Council Position No. 4</em></td>
        #     <td>1/1/2020</td>
        #     <td style="white-space:nowrap;">
        #         <span ...>12/31/2023</span>
        #     </td>
        #     <td style="white-space:nowrap;">
        #         <a ...>Alex.Pedersen@seattle.gov</a>
        #     </td>
        #     <td style="white-space:nowrap;">
        #         <a ...>http://www.seat...ouncil/pedersen</a>
        #     </td>
        # </tr>
        for tr in soup.find_all(
            "tr",
            # each row with this id in said table is for a council member
            id=re.compile(r"ctl\d+_ContentPlaceHolder\d+_gridPeople_ctl\d+__\d+"),
        ):
            # <a> tag in this row with this id has the full name
            try:
                name = str_simplified(
                    tr.find(
                        "a",
                        id=re.compile(
                            r"ctl\d*_ContentPlaceHolder\d*"
                            r"_gridPeople_ctl\d*_ctl\d*_hypPerson"
                        ),
                    ).text
                )
            except AttributeError:
                # find() returned None
                continue

            # <a> tag in this row with this id has the url
            # for the web page with more info on this person
            try:
                person_picture_url = SeattleScraper.get_person_picture_url(
                    tr.find(
                        "a",
                        id=re.compile(
                            r"ctl\d*_ContentPlaceHolder\d*"
                            r"_gridPeople_ctl\d*_ctl\d*_hypWebSite"
                        ),
                    )["href"]
                )
            except (AttributeError, TypeError):
                # find() returned None
                continue

            # <td> in this row with <br> and <em> has the seat name
            # <td>Councilmember<br /><em>Council Position No. 4</em></td>
            # the seat is the <em>-phasized text
            try:
                seat = Seat(
                    name=str_simplified(
                        [
                            td
                            for td in tr.find_all("td")
                            if td.find("br") is not None and td.find("em") is not None
                        ][0].em.text
                    )
                )
            except IndexError:
                # accessed 0-th item in an empty list []
                continue

            # from "Council Position No. 4"
            #     Seat.electoral_area: District 4
            #     Seat.name: Position 4
            # from "At-large Council Position No. 9"
            #     Seat.electoral_area: Citywide
            #     Seat.name: Position 9
            match = re.search(
                r"(?P<atlarge>At.*large)?.*position.*(?P<position_num>\d+)",
                seat.name,
                re.IGNORECASE,
            )
            if match:
                seat_number = match.group("position_num")
                seat.electoral_area = f"District {seat_number}"
                if match.group("atlarge"):
                    seat.electoral_area = "Citywide"
                seat.name = f"Position {seat_number}"

            static_person_info.append(
                Person(name=name, picture_uri=person_picture_url, seat=seat)
            )

        return static_person_info

    @staticmethod
    def dump_static_info(file_path: str) -> bool:
        """
        Save static data in json format.

        Parameters
        ----------
        file_path: str
            Static data dump file path

        Returns
        -------
        bool
            True if some data was saved in file_path

        See Also
        --------
        LegistarScraper.inject_known_data
        """
        static_person_info = {}
        # get_static_person_info() may return None; treat that as "no data"
        for person in SeattleScraper.get_static_person_info() or []:
            # save this Person in json keyed by the name
            static_person_info[person.name] = json.loads(person.to_json())

        if not static_person_info:
            return False

        with open(file_path, "w") as dump:
            dump.write(
                json.dumps({STATIC_FILE_KEY_PERSONS: static_person_info}, indent=4)
            )
        return True
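
###############################################################################

# Minimal usage sketch, not part of the scraper proper. It assumes the inherited
# LegistarScraper.get_events(begin=..., end=...) signature and a hypothetical
# local file name for the static-data dump.
if __name__ == "__main__":
    from datetime import timedelta

    # refresh the static person/seat data consumed by parse_static_file()
    if SeattleScraper.dump_static_info("seattle-static.json"):
        log.info("wrote seattle-static.json")

    # scrape roughly the last week of events
    scraper = SeattleScraper()
    events = scraper.get_events(
        begin=datetime.utcnow() - timedelta(days=7),
        end=datetime.utcnow(),
    )
    log.info("scraped %d events", len(events))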