Source code for cdp_backend.pipeline.transcript_model

#!/usr/bin/env python

from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional

from dataclasses_json import DataClassJsonMixin

###############################################################################
# Annotation Definitions
#
# Developers please document any annotation you wish to add.
# Docstrings for constants go _after_ the constant.


@dataclass
class WordAnnotations(DataClassJsonMixin):
    """
    Word-level annotations, i.e. annotations attached to a single word.

    No annotation fields are defined yet; add (and document) new ones here.
    """
@dataclass
class SentenceAnnotations(DataClassJsonMixin):
    """
    Sentence-level annotations, i.e. annotations attached to a single sentence.

    No annotation fields are defined yet; add (and document) new ones here.
    """
@dataclass
class SectionAnnotation(DataClassJsonMixin):
    """
    A section annotation used for topic segmentation and minutes item alignment.

    Parameters
    ----------
    name: str
        The name of the section.
    start_sentence_index: int
        The sentence index that acts as the starting point for the section.
    stop_sentence_index: Optional[int]
        The sentence index that acts as the stopping point for the section.
        This field has no default and must always be provided, but `None` is an
        accepted value (see Notes).
    generator: str
        A description of the algorithm or annotator that provided this annotation.
    description: Optional[str]
        An optional description of what the section is about.
        Default: None

    Notes
    -----
    The attributes `start_sentence_index` and `stop_sentence_index` should be
    treated as inclusive and exclusive respectively, exactly like how the Python
    `slice` function works.

    I.e. given a transcript of ordered sentences, the sentence indices will work as
    the parameters for a slice against the list of sentences:
    `sentences[start_sentence_index:stop_sentence_index]`

    Under those slice semantics a `stop_sentence_index` of `None` selects every
    sentence from `start_sentence_index` through the end of the transcript.
    NOTE(review): presumably that is the intended meaning of `None` here --
    confirm with the producers of section annotations.

    Examples
    --------
    Usage pattern for annotation attachment.

    >>> transcript.annotations.sections = [
    ...     SectionAnnotation(
    ...         name="Public Comment",
    ...         start_sentence_index=12,
    ...         stop_sentence_index=87,
    ...         generator="Eva Maxfield Brown",
    ...     ),
    ...     SectionAnnotation(
    ...         name="CB 120121",
    ...         start_sentence_index=243,
    ...         stop_sentence_index=419,
    ...         description="AN ORDINANCE relating to land use and zoning ...",
    ...         generator="queue-cue--v1.0.0",
    ...     ),
    ... ]
    """

    name: str
    start_sentence_index: int
    stop_sentence_index: Optional[int]
    generator: str
    description: Optional[str] = None
@dataclass
class TranscriptAnnotations(DataClassJsonMixin):
    """
    Transcript-wide annotations; each one is optional and may be absent.

    Parameters
    ----------
    sections: Optional[List[SectionAnnotation]]
        The section annotations attached to the transcript, if any.
        Default: None (no section annotations)
    """

    sections: Optional[List[SectionAnnotation]] = None
###############################################################################
@dataclass
class Word(DataClassJsonMixin):
    """
    A single word of a transcript sentence, with its timing.

    Parameters
    ----------
    index: int
        Position of this word within the sentence it belongs to.
    start_time: float
        When the word begins, in seconds.
    end_time: float
        When the word ends, in seconds.
    text: str
        Raw text of the word, lowercased and cleaned of all
        non-delimiting characters.
    annotations: Optional[WordAnnotations]
        Annotations that apply to this word only.
        Default: None (no annotations)
    """

    index: int
    start_time: float
    end_time: float
    text: str
    annotations: Optional[WordAnnotations] = None
@dataclass
class Sentence(DataClassJsonMixin):
    """
    A single sentence of a transcript, with timing, speaker and word data.

    Parameters are listed in field (i.e. positional constructor) order.

    Parameters
    ----------
    index: int
        Position of this sentence within the transcript it belongs to.
    confidence: float
        A number between 0 and 1 for the confidence of the sentence accuracy.
    start_time: float
        When the sentence begins, in seconds.
    end_time: float
        When the sentence ends, in seconds.
    words: List[Word]
        The words that make up the sentence.
    text: str
        Text of the sentence, including all formatting and
        non-delimiting characters.
    speaker_index: Optional[int]
        Index of the speaker of this sentence, when available.
        Default: None
    speaker_name: Optional[str]
        Name of the speaker of this sentence, when available.
        Default: None
    annotations: Optional[SentenceAnnotations]
        Annotations that apply to this sentence only.
        Default: None (no annotations)
    """

    index: int
    confidence: float
    start_time: float
    end_time: float
    words: List[Word]
    text: str
    speaker_index: Optional[int] = None
    speaker_name: Optional[str] = None
    annotations: Optional[SentenceAnnotations] = None
@dataclass
class Transcript(DataClassJsonMixin):
    """
    Transcript model for all transcripts in CDP databases / filestores.

    Parameters
    ----------
    generator: str
        A descriptive name of the generative process that produced this transcript.
        Example: "Google Speech-to-Text -- Lib Version: 2.0.1"
    confidence: float
        A number between 0 and 1.
        If available, use the average of all confidence annotations reported for
        each text block in the transcript.
        Otherwise, make an estimation for (or manually calculate):
        `n-correct-tokens / n-total-tokens` for the whole transcript.
    session_datetime: Optional[str]
        ISO formatted datetime for the session that this document transcribes.
    created_datetime: str
        ISO formatted datetime for when this transcript was created.
    sentences: List[Sentence]
        A list of sentences.
    annotations: Optional[TranscriptAnnotations]
        Any annotations that can be applied to the whole transcript.
        Default: None (no annotations)

    Examples
    --------
    Dumping transcript to JSON file.

    >>> # transcript = Transcript(...)
    ... with open("transcript.json", "w") as open_resource:
    ...     open_resource.write(transcript.to_json())

    Reading transcript from JSON file.

    >>> with open("transcript.json", "r") as open_resource:
    ...     transcript = Transcript.from_json(open_resource.read())
    """

    generator: str
    confidence: float
    session_datetime: Optional[str]
    created_datetime: str
    sentences: List[Sentence]
    annotations: Optional[TranscriptAnnotations] = None

    def __repr__(self) -> str:
        """
        Return a short-form summary of the transcript.

        The sentence list is replaced by a placeholder and its length so that
        printing a transcript does not dump its full content.

        Returns
        -------
        str
            `ClassName(attr=value, ...)` with string values wrapped in single
            quotes and `sentences` rendered as `[...] (n=<count>)`.
        """
        parts = []

        # Use vars so that attributes added by subclasses are included as well
        for name, value in vars(self).items():
            if name == "sentences":
                # Truncate sentences: only report how many there are
                parts.append(f"{name}=[...] (n={len(value)})")
            elif isinstance(value, str):
                # Add quotes for strings
                parts.append(f"{name}='{value}'")
            else:
                parts.append(f"{name}={value}")

        # Joining avoids trimming a trailing separator, which used to mangle
        # the output when there were no attributes to show. The runtime class
        # name keeps the summary accurate for subclasses.
        return f"{type(self).__name__}({', '.join(parts)})"
###############################################################################

# A small, fully populated transcript (two sentences, two speakers and a single
# section annotation) that can be used as a reference for the model's shape.
EXAMPLE_TRANSCRIPT = Transcript(
    generator="EvaGen -- Lib Version: 0.0.0",
    confidence=0.93325,
    session_datetime=datetime(2021, 1, 10, 15).isoformat(),
    # `datetime.utcnow()` is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop the tzinfo so the ISO string keeps its naive form
    # (no "+00:00" suffix), exactly as before.
    created_datetime=datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    sentences=[
        Sentence(
            index=0,
            text="Hello everyone.",
            confidence=0.9,
            start_time=0.0,
            end_time=1.0,
            speaker_name="Eva Maxfield Brown",
            speaker_index=0,
            words=[
                Word(
                    index=0,
                    start_time=0.0,
                    end_time=0.5,
                    text="hello",
                ),
                Word(
                    index=1,
                    start_time=0.5,
                    end_time=1.0,
                    text="everyone",
                ),
            ],
        ),
        Sentence(
            index=1,
            text="Hi all.",
            confidence=0.95,
            start_time=1.0,
            end_time=2.0,
            speaker_name="Isaac Na",
            speaker_index=1,
            words=[
                Word(
                    index=0,
                    start_time=1.0,
                    end_time=1.5,
                    text="hi",
                ),
                Word(
                    index=1,
                    start_time=1.5,
                    end_time=2.0,
                    text="all",
                ),
            ],
        ),
    ],
    annotations=TranscriptAnnotations(
        sections=[
            SectionAnnotation(
                name="Call to Order",
                start_sentence_index=0,
                # Exclusive stop index: covers sentences 0 and 1
                stop_sentence_index=2,
                generator="Eva Maxfield Brown",
            )
        ],
    ),
)