Source code for whisper_experiments.diff

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from itertools import filterfalse
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    NamedTuple,
    Optional,
    Tuple,
    Union,
)

import rapidfuzz
import text_diff
from text_diff import AddedLine, ModifiedLine, RemovedLine, UnchangedLine

###############################################################################

# The text_diff line classes are reused for word-level diffs; these aliases
# only signal that an object refers to a word rather than a line.
ModifiedWord = ModifiedLine
RemovedWord = RemovedLine
AddedWord = AddedLine
# A changed word / a changed line. Both unions list the same three classes;
# UnchangedLine is deliberately not a member of either.
Word = Union[ModifiedLine, RemovedLine, AddedLine]
Line = Union[ModifiedLine, RemovedLine, AddedLine]
# Either a changed word or a changed line.
TextType = Union[Word, Line]

###############################################################################


class TextDiff:
    """
    Wrapper for text_diff objects e.g. ModifiedLine.

    The wrapper exposes a uniform view (``is_removed`` / ``is_added`` /
    ``is_modified`` and ``content`` / ``content_before`` / ``content_after``)
    regardless of which text_diff class is wrapped.

    Notes
    -----
    ``__eq__`` is defined without ``__hash__``, so instances are unhashable.
    """

    def __init__(self, text_diff: TextType):
        """
        Parameters
        ----------
        text_diff: TextType
            The text_diff object to wrap
        """
        self.text_diff = text_diff

    def __eq__(self, other: object) -> bool:
        """
        Compare two wrappers by their change kind and their contents.

        Returns
        -------
        bool
            True if both wrappers agree on is_removed, is_added, is_modified,
            content, content_before and content_after.
            ``NotImplemented`` is returned for any operand that is not a
            TextDiff, so ``==`` evaluates to False (and ``!=`` to True)
            instead of raising.
        """
        # Returning NotImplemented (rather than raising NotImplementedError)
        # is the protocol Python expects from rich comparisons; it keeps
        # `x == None`, `x in some_list`, etc. from blowing up.
        # See https://stackoverflow.com/a/54816069
        if not isinstance(other, TextDiff):
            return NotImplemented

        return (
            self.is_removed == other.is_removed
            and self.is_added == other.is_added
            and self.is_modified == other.is_modified
            and self.content == other.content
            and self.content_before == other.content_before
            and self.content_after == other.content_after
        )

    def __str__(self) -> str:
        """
        Returns
        -------
        str
            "Modified: <before> -> <after>", "Removed: <content>" or
            "Added: <content>". Anything that is neither modified nor removed
            is rendered with the "Added" form.
        """
        if self.is_modified:
            return f"Modified: {self.content_before} -> {self.content_after}"
        if self.is_removed:
            return f"Removed: {self.content}"
        return f"Added: {self.content}"

    @property
    def is_removed(self) -> bool:
        """
        Returns
        -------
        bool
            True if wrapped text is removed from left version to right
        """
        return isinstance(self.text_diff, RemovedLine)

    @property
    def is_added(self) -> bool:
        """
        Returns
        -------
        bool
            True if wrapped text is added from left version to right
        """
        return isinstance(self.text_diff, AddedLine)

    @property
    def is_modified(self) -> bool:
        """
        Returns
        -------
        bool
            True if wrapped text is modified from left version to right
        """
        return isinstance(self.text_diff, ModifiedLine)

    @property
    def content(self) -> Optional[str]:
        """
        Returns
        -------
        Optional[str]
            The ``content`` attribute of the wrapped object, or None when the
            wrapped object has no such attribute.

        Notes
        -----
        If wrapped text is modified, it cannot be decided which content to
        return, i.e. the left or the right version; use ``content_before`` /
        ``content_after`` in that case.
        """
        return getattr(self.text_diff, "content", None)

    @property
    def content_before(self) -> Optional[str]:
        """
        Returns
        -------
        Optional[str]
            The left version of the wrapped text: ``content`` when the text
            is removed, otherwise the wrapped object's ``content_before``
            attribute, or None if it has none.
        """
        if self.is_removed:
            return self.content
        return getattr(self.text_diff, "content_before", None)

    @property
    def content_after(self) -> Optional[str]:
        """
        Returns
        -------
        Optional[str]
            The right version of the wrapped text: ``content`` when the text
            is added, otherwise the wrapped object's ``content_after``
            attribute, or None if it has none.
        """
        if self.is_added:
            return self.content
        return getattr(self.text_diff, "content_after", None)
class LineComparison(NamedTuple):
    """
    One changed line together with the word-level changes inside it.
    """

    # The changed line (ModifiedLine, RemovedLine or AddedLine) wrapped in
    # a TextDiff
    line: TextDiff
    # The words that differ within this line, each wrapped in a TextDiff
    words: List[TextDiff]

    def __str__(self) -> str:
        """
        Returns
        -------
        str
            Two newline-terminated rows: the line, then the comma-separated
            words in square brackets.
        """
        rendered_words = ", ".join(str(word) for word in self.words)
        return f" line: {self.line}\n words: [{rendered_words}]\n"
class TextComparison(NamedTuple):
    """
    Result of comparing two text blobs: a similarity score plus the lines
    that differ.
    """

    # Similarity score between the left and the right text blobs
    similarity: float
    # List of different lines
    lines: List[LineComparison]

    def __eq__(self, other: object) -> bool:
        """
        Compare two results by similarity score and by changed lines.

        Returns
        -------
        bool
            True if both ``similarity`` and ``lines`` are equal.
            ``NotImplemented`` is returned for any operand that is not a
            TextComparison, so Python falls back to the other operand's
            comparison instead of raising.

        Notes
        -----
        Because this class is a tuple, that fallback means a comparison
        against a plain tuple is decided by ordinary tuple equality.
        """
        # Returning NotImplemented (rather than raising NotImplementedError)
        # is the protocol Python expects from rich comparisons.
        # See https://stackoverflow.com/a/54816069
        if not isinstance(other, TextComparison):
            return NotImplemented
        return self.similarity == other.similarity and self.lines == other.lines

    def __str__(self) -> str:
        """
        Returns
        -------
        str
            The similarity score on the first row, followed by the changed
            lines (one ``str(LineComparison)`` each) inside square brackets.
        """
        lines_str = "\n".join(map(str, self.lines))
        return f"similarity: {self.similarity}\n" f"lines: [\n{lines_str}\n]"
def _is_unchanged(text_diff: TextType) -> bool:
    """
    Predicate used to filter out text that does not change from left to right.

    Returns
    -------
    bool
        True if the given diff entry is an UnchangedLine.
    """
    unchanged = isinstance(text_diff, UnchangedLine)
    return unchanged
def word_differences(
    words_1: Iterable[str], words_2: Iterable[str]
) -> Iterator[TextDiff]:
    """
    Iterate over the removed/added/modified words between the given lists
    of words.

    Parameters
    ----------
    words_1: Iterable[str]
        Left list of words
    words_2: Iterable[str]
        Right list of words

    Returns
    -------
    Iterator[TextDiff]
        Lazily produced Union[RemovedWord, AddedWord, ModifiedWord] entries,
        each wrapped as TextDiff

    See Also
    --------
    text_differences
        Calculate similarity of the texts and all line differences.

    Notes
    -----
    Unchanged words are excluded.
    The underlying diff is computed when this function is called; only the
    filtering and wrapping happen lazily as the result is consumed.
    """
    all_entries = text_diff.text_differences(words_1, words_2).diff_lines
    return (TextDiff(entry) for entry in all_entries if not _is_unchanged(entry))
def line_differences(
    lines_1: Iterable[str],
    lines_2: Iterable[str],
    word_split_func: Callable[[str], Iterable[str]] = str.split,
) -> Iterator[LineComparison]:
    """
    Generate the removed/added/modified lines between two lists of lines,
    each paired with its word-level differences.

    Parameters
    ----------
    lines_1: Iterable[str]
        Left list of lines
    lines_2: Iterable[str]
        Right list of lines
    word_split_func: Callable[[str], Iterable[str]]
        Function used to split a line into words.
        Default is str.split()

    Yields
    ------
    LineComparison
        Union[RemovedLine, AddedLine, ModifiedLine] wrapped as TextDiff
        List of different words in the line

    Raises
    ------
    TypeError
        If the diff produces an entry that is none of UnchangedLine,
        ModifiedLine, AddedLine or RemovedLine.

    See Also
    --------
    text_differences
        Calculate similarity of the texts and all line differences.
    word_differences
        Calculate just word differences.

    Notes
    -----
    Unchanged lines are excluded.
    """

    def advance_positions(
        entry: Line, left_pos: int, right_pos: int
    ) -> Tuple[int, int]:
        """
        Return the (left, right) line positions after consuming ``entry``.
        """
        if isinstance(entry, (UnchangedLine, ModifiedLine)):
            # Present on both sides
            left_step, right_step = 1, 1
        elif isinstance(entry, AddedLine):
            # Only present on the right side
            left_step, right_step = 0, 1
        elif isinstance(entry, RemovedLine):
            # Only present on the left side
            left_step, right_step = 1, 0
        else:
            raise TypeError
        return left_pos + left_step, right_pos + right_step

    def split_both_sides(entry: Line) -> Tuple[Iterable[str], Iterable[str]]:
        """
        Return the (left, right) word lists to compare for ``entry``.
        """
        if isinstance(entry, ModifiedLine):
            # Both versions exist, so each one is split on its own
            left_words = word_split_func(entry.content_before)
            right_words = word_split_func(entry.content_after)
            return left_words, right_words

        tokens = word_split_func(entry.content)
        if isinstance(entry, RemovedLine):
            # Nothing on the right side to compare against
            return tokens, []
        if isinstance(entry, AddedLine):
            # Nothing on the left side to compare against
            return [], tokens
        return tokens, tokens

    # Positions are advanced for every entry (which also validates the entry
    # type); nothing else in this function reads them.
    left_pos = right_pos = 0

    # Each entry is a RemovedLine, AddedLine, ModifiedLine, or UnchangedLine
    for entry in text_diff.text_differences(lines_1, lines_2).diff_lines:
        left_pos, right_pos = advance_positions(entry, left_pos, right_pos)

        # Same lines are excluded
        if _is_unchanged(entry):
            continue

        left_words, right_words = split_both_sides(entry)
        wrapped_line = TextDiff(entry)
        word_changes = list(word_differences(left_words, right_words))
        yield LineComparison(wrapped_line, word_changes)
def text_differences(
    text_1: str,
    text_2: str,
    similarity_calc: Callable[[str, str], float] = rapidfuzz.fuzz.QRatio,
    word_split_func: Callable[[str], Iterable[str]] = str.split,
) -> TextComparison:
    """
    Compare left and right text blobs.
    The whole texts are scored for similarity, then compared line by line,
    and the changed lines are further compared word by word.

    Parameters
    ----------
    text_1: str
        Left text
    text_2: str
        Right text
    similarity_calc: Callable[[str, str], float]
        Function used to calculate similarity score.
        Default is rapidfuzz.fuzz.QRatio()
    word_split_func: Callable[[str], Iterable[str]]
        Function used to split a line into words.
        Default is str.split()

    Returns
    -------
    TextComparison
        Similarity score
        List of different lines

    See Also
    --------
    line_differences
        Calculate just line differences.

    Notes
    -----
    Unchanged lines are excluded.
    """
    # Score the two blobs as a whole before looking at individual lines
    score = similarity_calc(text_1, text_2)

    left_lines = text_1.splitlines()
    right_lines = text_2.splitlines()
    changed_lines = list(line_differences(left_lines, right_lines, word_split_func))

    return TextComparison(similarity=score, lines=changed_lines)