whisper_experiments package#

Subpackages#

Submodules#

whisper_experiments.data module#

class whisper_experiments.data.FullDatasetFields[source]#

Bases: object

audio_uri = 'audio_uri'#
ground_truth_transcript_path = 'ground_truth_transcript_path'#
gsr_transcript_path = 'gsr_transcript_path'#
gsr_transcription_time = 'gsr_transcription_time'#
id_ = 'id'#
key = 'key'#
session_content_hash = 'session_content_hash'#
session_datetime = 'session_datetime'#
session_index_in_event = 'session_index_in_event'#
class whisper_experiments.data.GroundTruthDatasetFields[source]#

Bases: object

audio_uri = 'audio_uri'#
ground_truth_transcript_path = 'ground_truth_transcript_path'#
id_ = 'id'#
key = 'key'#
session_content_hash = 'session_content_hash'#
session_datetime = 'session_datetime'#
session_index_in_event = 'session_index_in_event'#
whisper_experiments.data.get_ground_truth_dataset(test: bool = False) DataFrame[source]#

Get the dataset we are using for testing Google Speech-to-Text and Whisper.

Parameters:
test: bool

If True, get a smaller, 5-file test set. Default: False (use the full ~50-file dataset).

Returns:
pd.DataFrame

DataFrame returned from cdp_data.datasets.get_session_dataset. Additional “audio_uri” column is added.

See also

cdp_data.datasets.get_session_dataset

The primary function this function wraps.

whisper_experiments.data.load_cdp_whisper_experiment_data(storage_dir: Path = PosixPath('cdp-whisper-experiments-data')) DataFrame[source]#

Load the archived and packaged data shipped with this library back into a pandas DataFrame with transcript paths fully resolved.

Will empty the provided storage_dir prior to unpacking.

whisper_experiments.diff module#

class whisper_experiments.diff.LineComparison(line, words)[source]#

Bases: NamedTuple

Create new instance of LineComparison(line, words)

line: TextDiff#

Alias for field number 0

words: List[TextDiff]#

Alias for field number 1

class whisper_experiments.diff.TextComparison(similarity, lines)[source]#

Bases: NamedTuple

Create new instance of TextComparison(similarity, lines)

lines: List[LineComparison]#

Alias for field number 1

similarity: float#

Alias for field number 0

class whisper_experiments.diff.TextDiff(text_diff: Union[ModifiedLine, RemovedLine, AddedLine])[source]#

Bases: object

Wrapper for text_diff objects e.g. ModifiedLine.

Parameters:
text_diff: TextType

The text_diff object to wrap

property content: Optional[str]#
Returns:
Optional[str]

The wrapped text. None if the is_modified property is True.

Notes

If wrapped text is modified, it cannot be decided which content to return, i.e. the left or the right version.

property content_after: Optional[str]#
Returns:
Optional[str]

The right version of the wrapped text. None if the is_removed property is True.

property content_before: Optional[str]#
Returns:
Optional[str]

The left version of the wrapped text. None if the is_added property is True.

property is_added: bool#
Returns:
bool

True if wrapped text is added from left version to right

property is_modified: bool#
Returns:
bool

True if wrapped text is modified from left version to right

property is_removed: bool#
Returns:
bool

True if wrapped text is removed from left version to right

whisper_experiments.diff.line_differences(lines_1: ~typing.Iterable[str], lines_2: ~typing.Iterable[str], word_split_func: ~typing.Callable[[str], ~typing.Iterable[str]] = <method 'split' of 'str' objects>) Iterator[LineComparison][source]#

Return list of removed/added/modified lines.

Parameters:
lines_1: Iterable[str]

Left list of lines

lines_2: Iterable[str]

Right list of lines

word_split_func: Callable[[str], Iterable[str]]

Function used to split a line into words. Default is str.split()

Yields:
LineComparison

Union[RemovedLine, AddedLine, ModifiedLine] wrapped as TextDiff, together with the list of differing words in the line.

See also

text_differences

Calculate similarity of the texts and all line differences.

word_differences

Calculate just word differences.

Notes

Unchanged lines are excluded.

whisper_experiments.diff.text_differences(text_1: str, text_2: str, similarity_calc: ~typing.Callable[[str, str], float] = <cyfunction QRatio>, word_split_func: ~typing.Callable[[str], ~typing.Iterable[str]] = <method 'split' of 'str' objects>) TextComparison[source]#

Compare left and right text blobs. They are first compared as lines, then as words for the changed lines.

Parameters:
text_1: str

Left text

text_2: str

Right text

similarity_calc: Callable[[str, str], float]

Function used to calculate similarity score. Default is rapidfuzz.fuzz.QRatio()

word_split_func: Callable[[str], Iterable[str]]

Function used to split a line into words. Default is str.split()

Returns:
TextComparison

The similarity score and the list of differing lines.

See also

line_differences

Calculate just line differences.

Notes

Unchanged lines are excluded.

whisper_experiments.diff.word_differences(words_1: Iterable[str], words_2: Iterable[str]) Iterator[TextDiff][source]#

Return list of removed/added/modified words between the given lists of words.

Parameters:
words_1: Iterable[str]

Left list of words

words_2: Iterable[str]

Right list of words

Returns:
TextDiff

Union[RemovedWord, AddedWord, ModifiedWord] wrapped as TextDiff

See also

text_differences

Calculate similarity of the texts and all line differences.

Notes

Unchanged words are excluded.

whisper_experiments.model module#

class whisper_experiments.model.GSRTranscribeParams(row: pandas.core.series.Series, credentials_file: str, storage_dir: pathlib.Path)[source]#

Bases: object

credentials_file: str#
row: Series#
storage_dir: Path#
whisper_experiments.model.generate_google_sr_dataset(sessions: DataFrame, credentials_file: str, storage_dir: Path = PosixPath('gsr-transcripts')) DataFrame[source]#

Process the audio files from the dataset with Google Speech-to-Text.

Parameters:
sessions: pd.DataFrame

The source dataset to use for processing.

credentials_file: str

The path to the Google Service Account Credentials JSON for the processing account / project.

storage_dir: Path

The path to a directory to store the generated transcripts in. Default: gsr-transcripts/

Returns:
pd.DataFrame

The same session dataset with GSR transcription columns added. Note: the rows may be in a different order due to threading.

See also

whisper_experiments.data.get_ground_truth_dataset

The data that should be provided to this function.

Module contents#

Top-level package for whisper_experiments.