NLP Cleaning Pipeline
This post contains the snippets I use most often in my Natural Language Processing adventures.
Imports
import re
import os
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
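Note that word_tokenize, stopwords, and WordNetLemmatizer rely on NLTK data packages that don't ship with the library itself. If you hit a LookupError, a one-time download fixes it:

import nltk

nltk.download("punkt")      # tokenizer models used by word_tokenize
nltk.download("stopwords")  # the stop word lists
nltk.download("wordnet")    # lexicon behind WordNetLemmatizer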
Lowercase
def lowercase(input):
"""
returns lowercase text
"""
return input.lower()
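For example:

print(lowercase("Hello NLP World!"))  # hello nlp world!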
Remove Punctuation
def remove_punctuation(input):
"""
    returns text without punctuation or digits
    """
    return input.translate(str.maketrans("", "", string.punctuation + string.digits))
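Passing three arguments to str.maketrans builds a table whose third argument lists characters to delete, so digits are stripped along with the punctuation:

print(remove_punctuation("Hello, world! Call me at 555."))
# -> 'Hello world Call me at '  (digits go too)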
Remove Whitespace
def remove_whitespaces(input):
"""
returns text without extra whitespaces
"""
return " ".join(input.split())
Remove Emoji
def remove_emoji(input):
    """
    Removes emojis from the corpus
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", input)
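These ranges cover the most common emoji blocks but not all of Unicode; newer emoji, such as those in the U+1F900 supplemental block, will slip through. Usage:

print(remove_emoji("Deployed! 😀🚀"))  # -> 'Deployed! '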
Remove HTML
def remove_html_tags(input):
"""
    returns text without html tags
"""
    soup = BeautifulSoup(input, "html.parser")
stripped_input = soup.get_text(separator=" ")
return stripped_input
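The separator=" " argument stops words in adjacent tags from being fused together; any doubled spaces it leaves behind are cleaned up by remove_whitespaces later in the pipeline:

print(remove_html_tags("<p>Hello<br>world</p>"))  # Hello world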
Tokenize into words
def tokenize(input):
"""
returns tokenized version of text
Parameters:
input(str): string which contains all the data
Returns:
Tokenized string (list)
"""
return word_tokenize(input)
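Unlike a plain str.split, NLTK's tokenizer separates punctuation and splits contractions:

print(tokenize("Don't panic, Mr. Adams."))
# ['Do', "n't", 'panic', ',', 'Mr.', 'Adams', '.']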
Remove Stop Words
STOP_WORDS = set(stopwords.words("english"))
def remove_stop_words(input):
"""
    Returns tokenized text without stop words
Parameters:
input(str): string which contains all the data
Returns:
    Tokenized word list without stopwords (list)
"""
input = word_tokenize(input)
return [word for word in input if word not in STOP_WORDS]
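NLTK's English stop word list is all lowercase, so this only catches words like "The" if the text has been lowercased first (the full pipeline below does this):

print(remove_stop_words("this is a test of the pipeline"))  # ['test', 'pipeline']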
Lemmatize
def lemmatize(input: str):
"""
Lemmatizes input using nltk's wordnetlemmatizer
Parameters:
input(str): string which contains all the data
Returns:
    Lemmatized string (str)
"""
    lemmatizer = WordNetLemmatizer()
input_str = word_tokenize(input)
new_words = []
for word in input_str:
new_words.append(lemmatizer.lemmatize(word))
return " ".join(new_words)
Total Pipeline
Putting all the functions defined above together.
def nlp_pipeline(input):
"""Function that calls all other functions together to perform nlp on a given text
Parameters:
input(str): string which contains all the data
Returns:
Cleaned string(str)
"""
return lemmatize(
" ".join(
remove_stop_words(
remove_emoji(
remove_whitespaces(
remove_punctuation(remove_html_tags(lowercase(input)))
)
)
)
)
)
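A quick end-to-end check (this is the output I'd expect with the standard NLTK stop word list; exact results can vary across NLTK versions):

messy = "<p>The 3 cats   were RUNNING around! 😀</p>"
print(nlp_pipeline(messy))  # cat running around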