##!/usr/bin/env python
import logging
import re
import string
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
[docs]
def remove_emojis(text: str) -> str:
    """
    Strip emoji and related pictographic characters from a string.

    Adapted, with minor changes, from this Stack Overflow answer:
    https://stackoverflow.com/a/58356570.

    Parameters
    ----------
    text: str
        The text to strip.

    Returns
    -------
    stripped_text: str
        The text with every run of matching characters deleted. Nothing is
        inserted in their place, so surrounding whitespace is left untouched.

    Notes
    -----
    The ``U+24C2``-``U+1F251`` range is very wide and is not limited to
    emojis: it also spans, for example, the CJK unified ideographs
    (``U+4E00``-``U+9FFF``), so those characters are removed as well.
    """
    # Members of a single regex character class, concatenated in this order.
    character_class = "".join(
        (
            "\U0001F600-\U0001F64F",  # emoticons
            "\U0001F300-\U0001F5FF",  # symbols & pictographs
            "\U0001F680-\U0001F6FF",  # transport & map symbols
            "\U0001F1E0-\U0001F1FF",  # flags (iOS)
            "\U00002702-\U000027B0",  # dingbats
            "\U000024C2-\U0001F251",  # very broad range, see Notes
            "\U0001f926-\U0001f937",
            "\u2600-\u2B55",
            "\u200d",  # zero width joiner
            "\u23cf",
            "\u23e9",
            "\u231a",
            "\ufe0f",  # variation selector-16
            "\u3030",
        )
    )
    matcher = re.compile("[" + character_class + "]+", re.UNICODE)
    return matcher.sub("", text)
[docs]
def clean_text(
    text: str,
    clean_stop_words: bool = False,
    clean_emojis: bool = False,
) -> str:
    """
    Clean text of common characters and extra formatting.

    Parameters
    ----------
    text: str
        The raw text to clean.
    clean_stop_words: bool
        Should English stop words be removed from the raw text or not.
        Default: False (do not remove stop words)
    clean_emojis: bool
        Should emojis, emoticons, pictograms, and other characters be removed.
        Default: False (do not remove pictograms)

    Returns
    -------
    cleaned_text: str
        The cleaned text. This is an empty string when nothing is left after
        cleaning (for example, when the input was only punctuation).

    Notes
    -----
    Every character in ``string.punctuation`` is removed, periods included.
    Stop words are matched case-sensitively, as whole words, after punctuation
    has already been removed. If the NLTK stop word corpus cannot be found it
    is downloaded on first use.
    """
    # Replace line breaks and tabs with spaces. Carriage returns are included
    # so that CRLF input does not leave a stray "\r" attached to a word.
    cleaned_formatting = text.translate(str.maketrans("\n\r\t", "   "))

    # Replace common sentence structures. This must happen before punctuation
    # removal so that "a--b" becomes "a b" rather than "ab".
    cleaned_sentence_structs = cleaned_formatting.replace("--", " ")

    # Remove all punctuation (periods included)
    cleaned_text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", cleaned_sentence_structs
    )

    # Remove stopwords
    if clean_stop_words:
        # Ensure stopwords are downloaded
        try:
            from nltk.corpus import stopwords

            stopwords_vocab = stopwords.words("english")
        except LookupError:
            import nltk

            nltk.download("stopwords")
            log.info("Downloaded nltk stopwords")
            from nltk.corpus import stopwords

            stopwords_vocab = stopwords.words("english")

        joined_stopwords = "|".join(stopwords_vocab)
        cleaned_text = re.sub(
            r"\b(" + joined_stopwords + r")\b",
            "",
            cleaned_text,
        )

    # Remove pictograms
    if clean_emojis:
        cleaned_text = remove_emojis(cleaned_text)

    # Collapse the gaps left behind by the removals above and trim the ends.
    # Neither call can raise, so an all-punctuation or all-stop-word input
    # simply yields "".
    return re.sub(r" {2,}", " ", cleaned_text).strip()