Source code for cdp_backend.pipeline.transcript_model

#!/usr/bin/env python

from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional

from dataclasses_json import DataClassJsonMixin

###############################################################################
# Annotation Definitions
#
# Developers please document any annotation you wish to add.
# Docstrings for constants go _after_ the constant.



[docs]
@dataclass
class WordAnnotations(DataClassJsonMixin):
    """
    Annotations that can appear on an individual word level.

    Notes
    -----
    This class currently declares no fields; it is the type used for the optional
    `annotations` attribute of `Word`.
    """




[docs]
@dataclass
class SentenceAnnotations(DataClassJsonMixin):
    """
    Annotations that can appear on an individual sentence level.

    Notes
    -----
    This class currently declares no fields; it is the type used for the optional
    `annotations` attribute of `Sentence`.
    """




[docs]
@dataclass
class SectionAnnotation(DataClassJsonMixin):
    """
    A section annotation used for topic segmentation and minutes item alignment.

    Parameters
    ----------
    name: str
        The name of the section.
    start_sentence_index: int
        The sentence index that acts as the starting point for the section.
    stop_sentence_index: Optional[int]
        The sentence index that acts as the stopping point for the section.
        This field has no default, so it must always be provided, but it may be
        None.
    generator: str
        A description of the algorithm or annotator that provided this annotation.
    description: Optional[str]
        An optional description of what the section is about.
        Default: None

    Notes
    -----
    The attributes `start_sentence_index` and `stop_sentence_index` should be treated
    as inclusive and exclusive respectively, exactly like how the Python `slice`
    function works.

    I.e. given a transcript of ordered sentences, the sentence indices will work
    as the parameters for a slice against the list of sentences:
    `sentences[start_sentence_index:stop_sentence_index]`

    Under those slice semantics a `stop_sentence_index` of None would select every
    sentence from `start_sentence_index` through the end of the transcript.
    NOTE(review): presumably that is the intended meaning of None -- confirm.

    Examples
    --------
    Usage pattern for annotation attachment.

    >>> transcript.annotations.sections = [
    ...     SectionAnnotation(
    ...         name="Public Comment",
    ...         start_sentence_index=12,
    ...         stop_sentence_index=87,
    ...         generator="Eva Maxfield Brown",
    ...     ),
    ...     SectionAnnotation(
    ...         name="CB 120121",
    ...         start_sentence_index=243,
    ...         stop_sentence_index=419,
    ...         description="AN ORDINANCE relating to land use and zoning ...",
    ...         generator="queue-cue--v1.0.0",
    ...     ),
    ... ]
    """

    name: str
    start_sentence_index: int
    stop_sentence_index: Optional[int]
    generator: str
    description: Optional[str] = None




[docs]
@dataclass
class TranscriptAnnotations(DataClassJsonMixin):
    """
    Annotations that can appear (but are not guaranteed) for the whole transcript.

    Parameters
    ----------
    sections: Optional[List[SectionAnnotation]]
        Section annotations attached to the transcript.
        Default: None (no section annotations)
    """

    sections: Optional[List[SectionAnnotation]] = None



###############################################################################



[docs]
@dataclass
class Word(DataClassJsonMixin):
    """
    Data for a word in a transcript.

    Parameters
    ----------
    index: int
        The index of the word in its respective sentence.
    start_time: float
        Time in seconds for when this word begins.
    end_time: float
        Time in seconds for when this word ends.
    text: str
        The raw text of the word, lowercased and cleaned of all non-delimiting chars.
    annotations: Optional[WordAnnotations]
        Any annotations specific to this word.
        Default: None (no annotations)
    """

    index: int
    start_time: float
    end_time: float
    text: str
    annotations: Optional[WordAnnotations] = None




[docs]
@dataclass
class Sentence(DataClassJsonMixin):
    """
    Data for a sentence in a transcript.

    Parameters are listed in field (positional) order.

    Parameters
    ----------
    index: int
        The index of the sentence in its respective transcript.
    confidence: float
        A number between 0 and 1 for the confidence of the sentence accuracy.
    start_time: float
        Time in seconds for when this sentence begins.
    end_time: float
        Time in seconds for when this sentence ends.
    words: List[Word]
        The list of words for the sentence.
    text: str
        The text of the sentence including all formatting and non-delimiting chars.
    speaker_index: Optional[int]
        The optional speaker index for the sentence.
        Default: None
    speaker_name: Optional[str]
        The optional speaker name for the sentence.
        Default: None
    annotations: Optional[SentenceAnnotations]
        Any annotations specific to this sentence.
        Default: None (no annotations)
    """

    index: int
    confidence: float
    start_time: float
    end_time: float
    words: List[Word]
    text: str
    speaker_index: Optional[int] = None
    speaker_name: Optional[str] = None
    annotations: Optional[SentenceAnnotations] = None




[docs]
@dataclass
class Transcript(DataClassJsonMixin):
    """
    Transcript model for all transcripts in CDP databases / filestores.

    Parameters
    ----------
    generator: str
        A descriptive name of the generative process that produced this transcript.
        Example: "Google Speech-to-Text -- Lib Version: 2.0.1"
    confidence: float
        A number between 0 and 1.
        If available, use the average of all confidence annotations reported for each
        text block in the transcript.
        Otherwise, make an estimation for (or manually calculate):
        `n-correct-tokens / n-total-tokens` for the whole transcript.
    session_datetime: Optional[str]
        ISO formatted datetime for the session that this document transcribes.
        This field has no default, so it must always be provided, but it may be
        None.
    created_datetime: str
        ISO formatted datetime for when this transcript was created.
    sentences: List[Sentence]
        A list of sentences.
    annotations: Optional[TranscriptAnnotations]
        Any annotations that can be applied to the whole transcript.
        Default: None (no annotations)

    Notes
    -----
    `repr()` of a transcript is intentionally shortened: the `sentences` attribute
    is rendered as `[...] (n=<count>)` instead of listing every sentence.

    Examples
    --------
    Dumping transcript to JSON file.

    >>> # transcript = Transcript(...)
    ... with open("transcript.json", "w") as open_resource:
    ...     open_resource.write(transcript.to_json())

    Reading transcript from JSON file.

    >>> with open("transcript.json", "r") as open_resource:
    ...     transcript = Transcript.from_json(open_resource.read())
    """

    generator: str
    confidence: float
    session_datetime: Optional[str]
    created_datetime: str
    sentences: List[Sentence]
    annotations: Optional[TranscriptAnnotations] = None

    def __repr__(self) -> str:
        """
        Print out shortform transcript details.

        Returns
        -------
        str
            `Transcript(<attr>=<value>, ...)` with one entry per instance attribute,
            in `vars(self)` order. The `sentences` attribute is truncated to its
            length and string values are wrapped in single quotes.
        """
        rendered = []

        # Use vars to maintain subclassing (attributes added by subclasses are
        # picked up automatically)
        for attr_name, attr_value in vars(self).items():
            # Truncate sentences
            if attr_name == "sentences":
                rendered.append(f"{attr_name}=[...] (n={len(attr_value)})")

            # Add quotes for strings (isinstance also covers str subclasses)
            elif isinstance(attr_value, str):
                rendered.append(f"{attr_name}='{attr_value}'")

            else:
                rendered.append(f"{attr_name}={attr_value}")

        # Joining avoids slicing off a trailing ", ", which would eat part of the
        # class name when the instance has no attributes at all
        return "Transcript(" + ", ".join(rendered) + ")"



###############################################################################


# A small, fully populated transcript: two sentences of two words each, from two
# different speakers, plus a single section annotation spanning both sentences.
EXAMPLE_TRANSCRIPT = Transcript(
    generator="EvaGen -- Lib Version: 0.0.0",
    confidence=0.93325,
    session_datetime=datetime(2021, 1, 10, 15).isoformat(),
    # `datetime.utcnow()` is deprecated since Python 3.12; take an aware UTC "now"
    # and drop the tzinfo so the ISO string keeps the same naive format as before.
    created_datetime=datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    sentences=[
        Sentence(
            index=0,
            text="Hello everyone.",
            confidence=0.9,
            start_time=0.0,
            end_time=1.0,
            speaker_name="Eva Maxfield Brown",
            speaker_index=0,
            words=[
                Word(
                    index=0,
                    start_time=0.0,
                    end_time=0.5,
                    text="hello",
                ),
                Word(
                    index=1,
                    start_time=0.5,
                    end_time=1.0,
                    text="everyone",
                ),
            ],
        ),
        Sentence(
            index=1,
            text="Hi all.",
            confidence=0.95,
            start_time=1.0,
            end_time=2.0,
            speaker_name="Isaac Na",
            speaker_index=1,
            words=[
                Word(
                    index=0,
                    start_time=1.0,
                    end_time=1.5,
                    text="hi",
                ),
                Word(
                    index=1,
                    start_time=1.5,
                    end_time=2.0,
                    text="all",
                ),
            ],
        ),
    ],
    annotations=TranscriptAnnotations(
        sections=[
            # stop index is exclusive, so 0-2 covers both sentences above
            SectionAnnotation(
                name="Call to Order",
                start_sentence_index=0,
                stop_sentence_index=2,
                generator="Eva Maxfield Brown",
            )
        ],
    ),
)