Source code for cdp_backend.utils.string_utils

#!/usr/bin/env python

import logging
import re
import string

###############################################################################

log = logging.getLogger(__name__)

###############################################################################


def remove_emojis(text: str) -> str:
    """
    Remove emojis, pictographs, and related symbol characters from text.

    Minor changes made from this answer on stackoverflow:
    https://stackoverflow.com/a/58356570.

    Parameters
    ----------
    text: str
        The text to strip emojis from.

    Returns
    -------
    cleaned_text: str
        The text with every matched character removed. Whitespace around the
        removed characters is left untouched.

    Notes
    -----
    The pattern from the linked answer contains the single range
    U+24C2 - U+1F251, which also covers CJK ideographs, Hangul, and many other
    scripts, and therefore deleted regular non-English text. It is replaced here
    with the symbol-only sub-ranges it was meant to cover.
    """
    emoj_patterns = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        # tiles, playing cards, enclosed alphanumerics & enclosed ideographs
        "\U0001F000-\U0001F251"
        "\U0001f926-\U0001f937"
        "\u24C2-\u25FF"  # circled letters, box drawing, geometric shapes
        "\u2600-\u2B55"  # misc symbols, dingbats, arrows
        "\u200d"  # zero width joiner (glues emoji sequences together)
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector (emoji presentation)
        "\u3030"
        "\u303d"
        "\u3297"
        "\u3299"
        "]+",
        re.UNICODE,
    )
    return emoj_patterns.sub("", text)
def _load_english_stop_words() -> list:
    """Return nltk's English stop words, downloading the corpus if it is missing."""
    try:
        from nltk.corpus import stopwords

        return stopwords.words("english")
    except LookupError:
        # The corpus is not available locally; fetch it and retry.
        import nltk

        nltk.download("stopwords")
        log.info("Downloaded nltk stopwords")
        from nltk.corpus import stopwords

        return stopwords.words("english")


def clean_text(
    text: str,
    clean_stop_words: bool = False,
    clean_emojis: bool = False,
) -> str:
    """
    Clean text of common characters and extra formatting.

    Parameters
    ----------
    text: str
        The raw text to clean.
    clean_stop_words: bool
        Should English stop words be removed from the raw text or not.
        Default: False (do not remove stop words)
    clean_emojis: bool
        Should emojis, emoticons, pictograms, and other characters be removed.
        Default: False (do not remove pictograms)

    Returns
    -------
    cleaned_text: str
        The cleaned text. An empty string is returned when nothing is left
        after cleaning.

    Notes
    -----
    Stop words are matched case-sensitively and only after punctuation has been
    stripped, so any stop-word entry that itself contains punctuation cannot
    match.
    """
    # Flatten new line and tab characters into plain spaces
    cleaned_text = text.replace("\n", " ").replace("\t", " ")

    # Treat double dashes as a word separator
    cleaned_text = cleaned_text.replace("--", " ")

    # Remove every character in string.punctuation (periods included)
    cleaned_text = re.sub(f"[{re.escape(string.punctuation)}]", "", cleaned_text)

    # Remove stopwords
    if clean_stop_words:
        # Escape each word so it is always matched literally
        joined_stopwords = "|".join(
            re.escape(word) for word in _load_english_stop_words()
        )
        cleaned_text = re.sub(
            r"\b(" + joined_stopwords + r")\b",
            "",
            cleaned_text,
        )

    # Remove pictograms
    if clean_emojis:
        cleaned_text = remove_emojis(cleaned_text)

    # Collapse the gaps left by the removals and trim the ends
    return re.sub(r" {2,}", " ", cleaned_text).strip()
def convert_gcs_json_url_to_gsutil_form(url: str) -> str:
    """
    Convert a GCS JSON API url to its corresponding gsutil uri.

    Parameters
    ----------
    url: str
        The url in GCS JSON API form.

    Returns
    -------
    gsutil_url: str
        The url in gsutil form.
        Returns empty string if the input url doesn't match the form.

    Notes
    -----
    The object name in the url path is percent-decoded before being placed in
    the gsutil uri, so ``dir%2Ffile.json`` becomes ``dir/file.json``.
    """
    # Local import keeps this function self-contained
    from urllib.parse import unquote

    # Dots are escaped so only the literal host matches; the bucket is a single
    # path segment and the object name runs up to the first "?alt=media".
    match = re.search(
        r"storage\.googleapis\.com/download/storage/v1/b/([^/]+)/o/(.+?)\?alt=media",
        url,
    )
    if match is None:
        return ""

    found_bucket, found_filename = match.group(1), unquote(match.group(2))
    return f"gs://{found_bucket}/{found_filename}"