NLP Cleaning Pipeline
This post contains the snippets I use most often in my Natural Language Processing adventures.
Imports
import re
import os
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
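Note that word_tokenize, stopwords, and WordNetLemmatizer rely on NLTK data packages that don't ship with the library itself. If you hit a LookupError, a one-time download fixes it:

import nltk

nltk.download("punkt")      # tokenizer models used by word_tokenize
nltk.download("stopwords")  # the stop word lists
nltk.download("wordnet")    # lexicon behind WordNetLemmatizer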
Lowercase
def lowercase(input):
"""
returns lowercase text
"""
return input.lower()
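For example:

print(lowercase("Hello NLP World!"))  # hello nlp world!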
Remove Punctuation
def remove_punctuation(input):
"""
    returns text without punctuation or digits
    """
    return input.translate(str.maketrans("", "", string.punctuation + string.digits))
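Passing three arguments to str.maketrans builds a table whose third argument lists characters to delete, so digits are stripped along with the punctuation:

print(remove_punctuation("Hello, world! Call me at 555."))
# -> 'Hello world Call me at '  (digits go too)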
Remove Whitespace
def remove_whitespaces(input):
"""
returns text without extra whitespaces
"""
return " ".join(input.split())
Remove Emoji
def remove_emoji(input):
    """
    Removes emojis from the corpus
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", input)
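These ranges cover the most common emoji blocks but not all of Unicode; newer emoji, such as those in the U+1F900 supplemental block, will slip through. Usage:

print(remove_emoji("Deployed! 😀🚀"))  # -> 'Deployed! '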
Remove HTML
def remove_html_tags(input):
"""
    returns text without html tags
"""
    soup = BeautifulSoup(input, "html.parser")
stripped_input = soup.get_text(separator=" ")
return stripped_input
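The separator=" " argument stops words in adjacent tags from being fused together; any doubled spaces it leaves behind are cleaned up by remove_whitespaces later in the pipeline:

print(remove_html_tags("<p>Hello<br>world</p>"))  # Hello world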
Tokenize into words
def tokenize(input):
"""
returns tokenized version of text
Parameters:
input(str): string which contains all the data
Returns:
Tokenized string (list)
"""
return word_tokenize(input)
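Unlike a plain str.split, NLTK's tokenizer separates punctuation and splits contractions:

print(tokenize("Don't panic, Mr. Adams."))
# ['Do', "n't", 'panic', ',', 'Mr.', 'Adams', '.']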
Remove Stop Words
STOP_WORDS = set(stopwords.words("english"))
def remove_stop_words(input):
"""
    Returns tokenized text without stop words
Parameters:
input(str): string which contains all the data
Returns:
    Tokenized word list without stopwords (list)
"""
input = word_tokenize(input)
return [word for word in input if word not in STOP_WORDS]
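NLTK's English stop word list is all lowercase, so this only catches words like "The" if the text has been lowercased first (the full pipeline below does this):

print(remove_stop_words("this is a test of the pipeline"))  # ['test', 'pipeline']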
Lemmatize
def lemmatize(input: str):
"""
Lemmatizes input using nltk's wordnetlemmatizer
Parameters:
input(str): string which contains all the data
Returns:
    Lemmatized string (str)
"""
    lemmatizer = WordNetLemmatizer()
input_str = word_tokenize(input)
new_words = []
for word in input_str:
new_words.append(lemmatizer.lemmatize(word))
return " ".join(new_words)
Total Pipeline
Putting all the functions defined above together.
def nlp_pipeline(input):
"""Function that calls all other functions together to perform nlp on a given text
Parameters:
input(str): string which contains all the data
Returns:
Cleaned string(str)
"""
return lemmatize(
" ".join(
remove_stop_words(
remove_emoji(
remove_whitespaces(
remove_punctuation(remove_html_tags(lowercase(input)))
)
)
)
)
)
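A quick end-to-end check (this is the output I'd expect with the standard NLTK stop word list; exact results can vary across NLTK versions):

messy = "<p>The 3 cats   were RUNNING around! 😀</p>"
print(nlp_pipeline(messy))  # cat running around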