Source code for cdp_scrapers.instances.portland

from __future__ import annotations

import logging
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, NamedTuple
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup, Tag
from cdp_backend.database.constants import (
    EventMinutesItemDecision,
    MatterStatusDecision,
    VoteDecision,
)
from cdp_backend.pipeline.ingestion_models import (
    Body,
    EventIngestionModel,
    EventMinutesItem,
    Matter,
    MinutesItem,
    Person,
    Session,
    SupportingFile,
    Vote,
)

from ..scraper_utils import (
    IngestionModelScraper,
    parse_static_file,
    reduced_list,
    str_simplified,
)

###############################################################################

log = logging.getLogger(__name__)

###############################################################################

SCRAPER_STATIC_DATA = parse_static_file(
    Path(__file__).parent / "portland-static.json", "America/Los_Angeles"
)

###############################################################################

MATTER_ADOPTED_PATTERNS = [
    "accepted",
    "passed",
    "adopted",
    "confirmed",
]
MATTER_IN_PROG_PATTERNS = [
    "passed to",
    "placed on",
    "continued",
    "referred",
]

MINUTE_ITEM_PASSED_PATTERNS = [
    # NOTE: while these words have a positive connotation,
    # they do not mean the legislation was passed.
    # They indicate the item (or a report, etc.) was accepted for discussion and vote.
    # "accepted",
    # "confirmed",
    # "adopted",
    "passed$",
]

###############################################################################


class WebPageSoup(NamedTuple):
    status: bool
    soup: BeautifulSoup | None = None

def load_web_page(url: str | Request) -> WebPageSoup:
    """
    Load web page at url and return content soupified.

    Parameters
    ----------
    url: str | urllib.request.Request
        Web page to load

    Returns
    -------
    result: WebPageSoup
        WebPageSoup.status = False if web page at url could not be loaded
    """
    try:
        with urlopen(url) as resp:
            return WebPageSoup(True, BeautifulSoup(resp.read(), "html.parser"))
    except (URLError, HTTPError) as e:
        log.error(f"Failed to open {url}: {str(e)}")

    return WebPageSoup(False)

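A quick usage sketch; the URL follows the agenda pattern that get_event() constructs further below and is illustrative only:

    page = load_web_page("https://www.portland.gov/council/agenda/2021/12/15")
    if page.status:
        print(page.soup.find("title").text)
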
def make_efile_url(efile_page_url: str) -> str:
    """
    Helper function to get file download link on a Portland EFile hosting web page.

    Parameters
    ----------
    efile_page_url: str
        URL to Portland efile hosting web page
        e.g. https://efiles.portlandoregon.gov/record/14803529

    Returns
    -------
    efile url: str
        URL to the file itself
        e.g. https://efiles.portlandoregon.gov/record/14803529/File/Document
    """
    if not efile_page_url.endswith("/"):
        efile_page_url += "/"
    return f"{efile_page_url}File/Document"

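For example, using the record URL from the docstring above:

    make_efile_url("https://efiles.portlandoregon.gov/record/14803529")
    # -> "https://efiles.portlandoregon.gov/record/14803529/File/Document"
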
def get_disposition(minute_section: Tag) -> str:
    """
    Return disposition string given within minute_section <div> on the event web page.

    Parameters
    ----------
    minute_section: Tag
        <div> within event web page for a given event minute item

    Returns
    -------
    disposition: str
        Disposition string for the event minute item
        e.g. Accepted, Passed, Placed on file
    """

    def _get_disposition(keyword: str):
        result_status_element_sibling = minute_section.find(
            "div", text=re.compile(keyword), attrs={"class": "field__label"}
        )
        result_status_element = (
            result_status_element_sibling.next_sibling
            if result_status_element_sibling is not None
            else None
        )
        return (
            result_status_element.text if result_status_element is not None else None
        )

    disposition = _get_disposition("Disposition")
    disposition = disposition or _get_disposition("Council action")
    return disposition or ""

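A minimal round trip against hand-written markup; this is a simplified sketch of the page structure (a label <div> followed immediately by its value <div>), not the exact page source:

    snippet = BeautifulSoup(
        '<div><div class="field__label">Disposition</div><div>Passed</div></div>',
        "html.parser",
    )
    get_disposition(snippet.div)  # -> "Passed"
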
def disposition_to_minute_decision(
    disposition: str,
) -> EventMinutesItemDecision | None:
    """
    Decide EventMinutesItemDecision constant from event minute item disposition.

    Parameters
    ----------
    disposition: str
        Disposition from the event web page for a given item
        e.g. Passed, Continued

    Returns
    -------
    decision: Optional[EventMinutesItemDecision]

    See Also
    --------
    MINUTE_ITEM_PASSED_PATTERNS
    """
    for pattern in MINUTE_ITEM_PASSED_PATTERNS:
        if re.search(pattern, disposition, re.I):
            return EventMinutesItemDecision.PASSED
    return None

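Because MINUTE_ITEM_PASSED_PATTERNS anchors on "passed$", only a trailing "passed" counts:

    disposition_to_minute_decision("Passed")                    # -> EventMinutesItemDecision.PASSED
    disposition_to_minute_decision("Passed to second reading")  # -> None, no trailing "passed"
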
def separate_name_from_title(title_and_name: str) -> str:
    """
    Return just name.

    Parameters
    ----------
    title_and_name: str
        e.g. Mayor Ted Wheeler

    Returns
    -------
    name: str
        title_and_name with all title-related words removed
        e.g. Ted Wheeler
    """
    # title_and_name:
    #     The title (Mayor or Commissioner) and name of a Portland City Commission
    #     member,
    #     e.g. Mayor Ted Wheeler, Commissioner Carmen Rubio,
    #     Former Commissioner Commissioner Jo Ann Hardesty
    while (
        "Former" in title_and_name
        or "Mayor" in title_and_name
        or "Commissioner" in title_and_name
    ):
        name_index = title_and_name.find(" ")
        title_and_name = title_and_name[name_index + 1 :]

    return title_and_name.strip()

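The loop strips one leading word per iteration, so stacked or duplicated titles like the last example in the comment come off too:

    separate_name_from_title("Mayor Ted Wheeler")
    # -> "Ted Wheeler"
    separate_name_from_title("Former Commissioner Commissioner Jo Ann Hardesty")
    # -> "Jo Ann Hardesty"
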
class PortlandScraper(IngestionModelScraper):
    def __init__(self):
        super().__init__(timezone="America/Los_Angeles")

    def get_person(self, name: str) -> Person:
        """
        Return matching Person from portland-static.json.

        Parameters
        ----------
        name: str
            Person full name

        Returns
        -------
        person: Person
            Matching Person from portland-static.json

        Raises
        ------
        KeyError
            If name does not exist in portland-static.json

        References
        ----------
        portland-static.json
        """
        if name not in SCRAPER_STATIC_DATA.persons:
            raise KeyError(f"{name} is unknown. Please update portland-static.json")
        return SCRAPER_STATIC_DATA.persons[name]

    def get_doc_number(self, minute_section: Tag, event_page: BeautifulSoup) -> str:
        """
        Find the document number in the minute_section.

        Parameters
        ----------
        minute_section: Tag
            <div> within event web page for a given event minute item
        event_page: BeautifulSoup
            The entire page where the event is found

        Returns
        -------
        doc_number: str
            The document number in the minute_section.
            If there is none, the section top number with the year is returned.
        """
        # Find document number
        doc_number_element_sibling = minute_section.find(
            "div", text=re.compile("Document number"), attrs={"class": "field__label"}
        )
        # If there is no document number, fall back to the section top number
        if doc_number_element_sibling is None:
            return self.get_section_top_number(minute_section, event_page)
        doc_number_element = doc_number_element_sibling.next_sibling
        doc_number = doc_number_element.find("div", class_="field__item").text.strip()
        return doc_number

    def get_section_top_number(
        self, minute_section: Tag, event_page: BeautifulSoup
    ) -> str:
        """
        Find the top section number in the minute_section.

        Parameters
        ----------
        minute_section: Tag
            <div> within event web page for a given event minute item
        event_page: BeautifulSoup
            The entire page where the event is found

        Returns
        -------
        doc_number: str
            The top section number in the minute_section,
            with the year appended at the end
        """
        agenda_name = event_page.find("title").text.strip()
        base_minute_section = minute_section.find("h4").text.strip()
        if agenda_name:
            # append the 4-digit year that follows the first comma in the page title
            return (
                base_minute_section
                + "-"
                + agenda_name[agenda_name.index(",") + 2 : agenda_name.index(",") + 6]
            )
        return base_minute_section

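The slice assumes the 4-digit year follows the first comma in the page title; the exact title format here is an assumption for illustration:

    title = "Council agenda, 2021-12-15 | Portland.gov"  # assumed <title> format
    title[title.index(",") + 2 : title.index(",") + 6]   # -> "2021"

So with an <h4> of "185", the method would return "185-2021".
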
    def get_matter(
        self, minute_section: Tag, event_page: BeautifulSoup
    ) -> Matter | None:
        """
        Make Matter from information in minute_section.

        Parameters
        ----------
        minute_section: Tag
            <div> within event web page for a given event minute item
        event_page: BeautifulSoup
            The entire page where the event is found

        Returns
        -------
        matter: Optional[Matter]
            Matter if required information could be parsed from minute_section
        """
        # Find title
        title_div = minute_section.find("div", class_="council-document__title")
        matter_type = None
        matter_title = None
        if title_div is not None:
            matter_title = title_div.find("a").text.strip()
            # Find type
            title_div.find("a").clear()
            matter_type = title_div.text.strip()
            if matter_type[0] == "(" and matter_type[-1] == ")":
                matter_type = matter_type[1:-1]
        else:
            matter_title = next(
                minute_section.find(
                    "div", class_="field--name-field-disposition-notes"
                ).children
            ).text.strip()
            matter_type = matter_title[
                matter_title.rindex("(") + 1 : matter_title.rindex(")")
            ]
            matter_title = matter_title[0 : matter_title.rindex("(") - 1]

        # Find result status
        result_status = get_disposition(minute_section)
        # Strings like "passed to second reading" are better caught
        # before searching for "passed",
        # so test for IN_PROGRESS first.
        for pattern in MATTER_IN_PROG_PATTERNS:
            if re.search(pattern, result_status, re.I):
                result_status = MatterStatusDecision.IN_PROGRESS
                break
        else:
            for pattern in MATTER_ADOPTED_PATTERNS:
                if re.search(pattern, result_status, re.I):
                    result_status = MatterStatusDecision.ADOPTED
                    break
            else:
                result_status = None

        # Find the sponsors
        sponsor_element_uncle = minute_section.find(
            "div", text=re.compile("Introduced by"), attrs={"class": "field__label"}
        )
        sponsor_list = None
        if sponsor_element_uncle is not None:
            sponsor_element_parent = sponsor_element_uncle.next_sibling
            sponsor_elements = sponsor_element_parent.find_all(
                "div", class_="field__item"
            )
            sponsor_list = reduced_list(
                [
                    self.get_person(
                        separate_name_from_title(sponsor_element.text.strip())
                    )
                    for sponsor_element in sponsor_elements
                ]
            )

        return self.get_none_if_empty(
            Matter(
                matter_type=matter_type,
                name=self.get_doc_number(minute_section, event_page),
                sponsors=sponsor_list,
                title=matter_title,
                result_status=result_status,
            ),
        )

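The pattern-list ordering matters here: a disposition such as "Passed to second reading" matches both an in-progress pattern ("passed to") and an adopted pattern ("passed"), and testing MATTER_IN_PROG_PATTERNS first is what resolves it correctly:

    # "Passed to second reading" -> MatterStatusDecision.IN_PROGRESS
    # "Adopted"                  -> MatterStatusDecision.ADOPTED
    # "Continued to January 5"   -> MatterStatusDecision.IN_PROGRESS
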
    def get_supporting_files(self, minute_section: Tag) -> list[SupportingFile] | None:
        """
        Return SupportingFiles for a given EventMinutesItem.

        Parameters
        ----------
        minute_section: Tag
            <div> within event web page for a given event minute item

        Returns
        -------
        supporting files: Optional[List[SupportingFile]]

        Notes
        -----
        Follow hyperlink to go to minutes item details page.
        On the details page look for directly-linked files
        and externally-hosted efiles.

        See Also
        --------
        make_efile_url
        """
        try:
            # on the event page, event minute item titles are listed
            # in <div> with a particular class attribute.
            # so look for such a <div> within this minute_section
            div = minute_section.find(
                "div", class_="field--label-hidden council-document__title"
            )
            # <a href="/council/documents/communication/placed-file/295-2021">
            details_url = f'https://www.portland.gov{div.find("a")["href"]}'
        except (AttributeError, TypeError):
            # minute_section.find() or div.find() failed
            return None

        # load the minutes item details page that may have links to supporting files
        details_soup = load_web_page(details_url)
        if not details_soup.status:
            return None

        supporting_files: list[SupportingFile] = []

        # first, try to get Documents and Exhibits and Impact Statement
        # these will contain links to files
        for div in details_soup.soup.find_all(
            "div",
            class_=re.compile(
                "field field--label-above field--name-field-"
                "((documents-and-exhibits)|(file-impact-statement)) field--type-file"
            ),
        ):
            supporting_files.extend(
                [
                    self.get_none_if_empty(
                        SupportingFile(
                            name=str_simplified(
                                re.sub(
                                    r"\s*download\s+file\s*",
                                    "",
                                    link.text,
                                    flags=re.IGNORECASE,
                                )
                            ),
                            uri=f'https://www.portland.gov{link["href"]}',
                        )
                    )
                    # <a href="/sites/...pdf"><span>Download file</span>
                    # <i class="fas fa-file-alt"></i>Exhibit A</a>
                    for link in div.find_all("a")
                ]
            )

        # finally parse for efile links
        # these are hosted on yet another web page; always start with https://efiles
        supporting_files.extend(
            [
                self.get_none_if_empty(
                    SupportingFile(
                        name=str_simplified(link.string),
                        uri=make_efile_url(link["href"]),
                    )
                )
                for link in details_soup.soup.find_all(
                    "a", href=re.compile(r"https:\/\/efiles.+")
                )
            ]
        )

        # remove any Nones
        return reduced_list(supporting_files)

    def get_votes(self, minute_section: Tag) -> list[Vote] | None:
        """
        Look for 'Votes:' in minute_section and create a Vote object for each line.

        Parameters
        ----------
        minute_section: Tag
            <div> within event web page for a given event minute item

        Returns
        -------
        votes: Optional[List[Vote]]
            Votes for corresponding event minute item if found
        """
        vote_element_uncle = minute_section.find(
            "div", text=re.compile("Votes"), attrs={"class": "field__label"}
        )
        if vote_element_uncle is None:
            return None

        vote_element_parent = vote_element_uncle.next_sibling
        vote_elements = vote_element_parent.find_all("div", class_="relation--type-")
        vote_list = []
        for vote_element in vote_elements:
            vote = vote_element.text.strip()
            # at this point vote string is like
            # Commissioner Jo Ann Hardesty Absent
            # Commissioner Mingus Mapps Yea
            is_absent = "absent" in vote.lower()
            vote = re.sub("absent", "", vote, flags=re.I)

            if "yea" in vote.lower():
                vote = re.sub("yea", "", vote, flags=re.I)
                decision = VoteDecision.APPROVE
                if is_absent:
                    decision = VoteDecision.ABSENT_APPROVE
            elif "nay" in vote.lower():
                vote = re.sub("nay", "", vote, flags=re.I)
                decision = VoteDecision.REJECT
                if is_absent:
                    decision = VoteDecision.ABSENT_REJECT
            elif is_absent:
                decision = VoteDecision.ABSENT_NON_VOTING
            else:
                decision = None

            # at this point any decision token like yea has been removed from vote
            name = separate_name_from_title(vote.strip())
            vote_list.append(
                self.get_none_if_empty(
                    Vote(decision=decision, person=self.get_person(name))
                )
            )

        return reduced_list(vote_list)

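For reference, the branch above maps raw vote strings to decisions like so (names taken from the comments in this module):

    # "Commissioner Mingus Mapps Yea"        -> VoteDecision.APPROVE
    # "Commissioner Carmen Rubio Nay"        -> VoteDecision.REJECT
    # "Commissioner Jo Ann Hardesty Absent"  -> VoteDecision.ABSENT_NON_VOTING
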
    def get_event_minutes(
        self, event_page: BeautifulSoup
    ) -> list[EventMinutesItem] | None:
        """
        Make EventMinutesItem from each relation--type-agenda-item <div> on event_page.

        Parameters
        ----------
        event_page: BeautifulSoup
            Web page for the meeting loaded as a bs4 object

        Returns
        -------
        event minute items: Optional[List[EventMinutesItem]]
        """
        minute_sections = event_page.find_all(
            "div", class_="relation--type-agenda-item"
        )
        event_minute_items = []
        for minute_section in minute_sections:
            matter = self.get_matter(minute_section, event_page)
            if matter is not None:
                minutes_item = self.get_none_if_empty(
                    MinutesItem(
                        name=self.get_doc_number(minute_section, event_page),
                        description=matter.title,
                    )
                )
            else:
                minutes_item = None

            event_minute_items.append(
                self.get_none_if_empty(
                    EventMinutesItem(
                        decision=disposition_to_minute_decision(
                            get_disposition(minute_section)
                        ),
                        matter=matter,
                        minutes_item=minutes_item,
                        supporting_files=self.get_supporting_files(minute_section),
                        votes=self.get_votes(minute_section),
                    )
                )
            )

        return reduced_list(event_minute_items)

    def get_sessions(self, event_page: BeautifulSoup) -> list[Session] | None:
        """
        Parse meeting video URIs from event_page, return Session for each video found.

        Parameters
        ----------
        event_page: BeautifulSoup
            Web page for the meeting loaded as a bs4 object

        Returns
        -------
        sessions: Optional[List[Session]]
            Session for each video found on event_page
        """
        # each session's metadata is given in <div class="session-meta">
        # including youtube video url for the session, if available
        # <div class="session-meta">
        # ...
        #     <time class="datetime">Wednesday, December 15, 2021 9:30 am</time>
        # ...
        #     <iframe src="https://www.youtube.com/...">
        sessions: list[Session] = []
        session_index = 0

        for session_div in event_page.find_all("div", class_="session-meta"):
            session_time = session_div.find("time", class_="datetime")
            # plenty of sessions have no video listed so must check.
            # recall we require video_uri for a valid Session.
            video_iframe = session_div.find("iframe", src=re.compile(".*youtube.*"))

            if session_time and video_iframe:
                sessions.append(
                    self.get_none_if_empty(
                        Session(
                            session_datetime=self.localize_datetime(
                                datetime.strptime(
                                    session_time.string,
                                    "%A, %B %d, %Y %I:%M %p",
                                )
                            ),
                            session_index=session_index,
                            video_uri=video_iframe["src"].split("?")[0],
                        )
                    )
                )
                session_index += 1

        return reduced_list(sessions)

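The <time> text shown in the comment above parses with the given format string:

    datetime.strptime(
        "Wednesday, December 15, 2021 9:30 am", "%A, %B %d, %Y %I:%M %p"
    )
    # -> datetime(2021, 12, 15, 9, 30)
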
    def get_agenda_uri(self, event_page: BeautifulSoup) -> str | None:
        """
        Find the uri for the file containing the agenda for a Portland, OR
        city council meeting.

        Parameters
        ----------
        event_page: BeautifulSoup
            Web page for the meeting loaded as a bs4 object

        Returns
        -------
        agenda_uri: Optional[str]
            The uri for the file containing the meeting's agenda
        """
        agenda_uri_element = event_page.find(
            "a", text=re.compile("Disposition Agenda"), attrs={"class": "btn-cta"}
        )
        if agenda_uri_element is not None:
            return make_efile_url(agenda_uri_element["href"])

        parent_agenda_uri_element = event_page.find("div", {"class": "inline-flex"})
        if parent_agenda_uri_element is not None:
            agenda_uri_element = parent_agenda_uri_element.find("a")
        else:
            return None

        if agenda_uri_element is not None:
            return f"https://www.portland.gov{agenda_uri_element['href']}"
        return None

    def get_event(self, event_time: datetime) -> EventIngestionModel | None:
        """
        Portland, OR city council meeting information for a specific date.

        Parameters
        ----------
        event_time: datetime
            Meeting date

        Returns
        -------
        Optional[EventIngestionModel]
            None if there was no meeting on event_time
            or information for the meeting did not meet minimal CDP requirements.
        """
        # try to load https://www.portland.gov/council/agenda/yyyy/m/d
        event_page = load_web_page(
            "https://www.portland.gov/council/agenda/"
            # we actually DON'T want to use strftime() because we must not zero-pad
            # e.g. for 2022/01/05, we MUST use 2022/1/5
            f"{event_time.year}/{event_time.month}/{event_time.day}"
        )
        if not event_page.status:
            # no meeting on requested day
            return None

        return self.get_none_if_empty(
            EventIngestionModel(
                agenda_uri=self.get_agenda_uri(event_page.soup),
                # NOTE: have not seen any specific body/bureau named on any agenda page
                body=Body(name="City Council"),
                event_minutes_items=self.get_event_minutes(event_page.soup),
                minutes_uri=None,
                sessions=self.get_sessions(event_page.soup),
            ),
        )

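For example, a January date produces the unpadded path the site expects:

    event_time = datetime(2022, 1, 5)
    f"{event_time.year}/{event_time.month}/{event_time.day}"
    # -> "2022/1/5", whereas strftime("%Y/%m/%d") would give "2022/01/05"
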
    def get_events(
        self,
        begin: datetime | None = None,
        end: datetime | None = None,
    ) -> list[EventIngestionModel]:
        """
        Portland, OR city council meeting information over given time span
        as List[EventIngestionModel].

        Parameters
        ----------
        begin: datetime, optional
            The timespan beginning datetime to query for events after.
            Default is 2 days from UTC now
        end: datetime, optional
            The timespan end datetime to query for events before.
            Default is UTC now

        Returns
        -------
        events: List[EventIngestionModel]

        References
        ----------
        https://www.portland.gov/council/agenda/all
        """
        if begin is None:
            begin = datetime.utcnow() - timedelta(days=2)
        if end is None:
            end = datetime.utcnow()

        return reduced_list(
            [
                self.get_event(begin + timedelta(days=day))
                for day in range((end - begin).days)
            ],
            # prefer to return [] over None to backend pipeline
            # for easier iteration there
            collapse=False,
        )

def get_portland_events(
    from_dt: datetime | None = None,
    to_dt: datetime | None = None,
    **kwargs: Any,
) -> list[EventIngestionModel]:
    """
    Public API for use in instances.__init__ so that this function can be attached
    as an attribute of the cdp_scrapers.instances module.
    The outside world, e.g. cdp-backend, can then reach it
    by asking for "get_portland_events".

    Parameters
    ----------
    from_dt: datetime, optional
        The timespan beginning datetime to query for events after.
        Default is 2 days from UTC now
    to_dt: datetime, optional
        The timespan end datetime to query for events before.
        Default is UTC now
    kwargs: Any
        Any extra keyword arguments to pass to the get events function.

    Returns
    -------
    events: List[EventIngestionModel]

    See Also
    --------
    cdp_scrapers.instances.__init__.py
    """
    scraper = PortlandScraper()
    return scraper.get_events(begin=from_dt, end=to_dt, **kwargs)

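A usage sketch; the date range is illustrative:

    from datetime import datetime

    from cdp_scrapers.instances.portland import get_portland_events

    events = get_portland_events(
        from_dt=datetime(2021, 12, 13),
        to_dt=datetime(2021, 12, 17),
    )
    print(len(events))
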