Data cleaning¶

Load package and data

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## customized package, see notebooks/Trie.py
from Trie import Trie, make_regex

## input: a text example - a job post
filename = 'data/text_example.txt'
## NOTE(review): no encoding is passed to open(), so the platform default is
## used - confirm it matches the encoding of the example file
with open(filename, 'rt') as handler:
    text = handler.read()

## English stop words (NLTK) and ASCII punctuation characters
stop_words = stopwords.words('english')
punc = string.punctuation

Clean text with regex¶

Clean special patterns

def clean_special_patterns(text):
    """Remove special patterns - URLs, e-mail addresses and date-like digit groups.

    input
    --------
    text: str, the raw text

    output
    --------
    str: the text with every match deleted and leading/trailing
         whitespace stripped
    """
    email_regex = re.compile(r"[\w.-]+@[\w.-]+")
    url_regex = re.compile(r"(http|www)[^\s]+")
    # NOTE(review): inside the brackets " -/" is a character range (space
    # through "/"), so ".", "," and similar also act as separators, and the
    # "*" allows no separator at all - any run of 4+ digits is removed too.
    date_regex = re.compile(r"[\d]{2,4}[ -/:]*[\d]{2,4}([ -/:]*[\d]{2,4})?") # a way to match date
    ## remove; URLs go first, so an "@" inside a URL is consumed with the URL
    text = url_regex.sub("", text)
    text = email_regex.sub("", text)
    text = date_regex.sub("", text)
    # strip must be called: the bare attribute is a bound method, not a string
    return text.strip()

## sample snippet containing two URLs, an e-mail address and a date
s = """Applications:
www.aa.frdfaunefehofer.de/defe/referfefenzenefe/afeda-cenfeter.html
http://www.ifefis.fe.com
email: fowjfoj@fwjofj.djfow
Kennziffer: IIS-2020-12-23
Bewerbungsfrist:"""

## run the cleaner on the snippet
clean_special_patterns(s)

'Applications: \n\n\nemail: \nKennziffer: IIS-\nBewerbungsfrist:\n'

Remove stopwords and punctuation

def clean_stopwords(text):
    """Delete every match of the stop-word regex from text.

    The pattern is built from the module-level stop_words list with
    make_regex (see Trie.py) each time the function is called.
    """
    return make_regex(stop_words).sub("", text)

def clean_punct(text):
    """Return text with every character of string.punctuation removed."""
    # escape the punctuation characters so they are literal inside the class
    char_class = "[" + re.escape(string.punctuation) + "]"
    return re.compile(char_class).sub("", text)

## each call returns a new string; `text` itself is left unchanged
clean_stopwords(text)
clean_punct(text)

For more about the Trie data structure, check my other post.
The script Trie.py can be found in the notebooks/ folder in my repo; the original code is here.

Clean text with NLTK¶

Tokenize

## split the text into tokens with NLTK's word_tokenize
tokens = word_tokenize(text)

Remove punctuation

## keep purely alphabetic tokens only (this drops punctuation) and lowercase them
words = [tok.lower() for tok in tokens if tok.isalpha()]

Remove stop words

## English stop-word list from NLTK
stop_words = stopwords.words('english')
## keep only the words that are not in the stop-word list
words = [w for w in words if w not in stop_words]

Stemming

## reduce every remaining word to its stem with the Porter stemmer
porter = PorterStemmer()
stemmed_words = list(map(porter.stem, words))

Text cleaning pipeline¶

def clean_text(text):
    """clean text by
    clean_special_patterns: email, date, url, etc.
    tokenize, keep alphabetic tokens only (drops punctuation), lowercase
    remove stop words
    stem words

    Uses the module-level `stop_words` list and `porter` stemmer.

    input
    --------
    text: str, the raw text

    output
    --------
    list: stemmed words
    """
    cleaned = clean_special_patterns(text)
    # tokenize the cleaned text; tokenizing the raw input would silently
    # throw away the special-pattern removal done above
    tokens = word_tokenize(cleaned)
    words = [word.lower() for word in tokens if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    stemmed_words = [porter.stem(word) for word in words]
    return stemmed_words

Text cleaning module¶

Build a cleaning module based on the above contents.

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from Trie import make_regex

## English stop-word list from NLTK
stop_words = stopwords.words('english')

## regex - compiled once at import time and shared by the functions below
## email: word characters, dots or hyphens on both sides of an "@"
email_regex = re.compile(r"[\w.-]+@[\w.-]+")
## url: "http" or "www" followed by one or more non-whitespace characters
url_regex = re.compile(r"(http|www)[^\s]+")
## date: two or three groups of 2-4 digits with optional separators between them
## NOTE(review): " -/" inside the brackets is a character range (space through
## "/"), so ".", "," etc. are separators too; confirm that this is intended
date_regex = re.compile(r"[\d]{2,4}[ -/:]*[\d]{2,4}([ -/:]*[\d]{2,4})?") # a way to match date
## runs of characters that are neither ASCII letters nor spaces
keep_word_regex = re.compile(r"[^A-Za-z ]+")
## built by Trie.make_regex from the stop-word list; presumably matches any
## single stop word - confirm against notebooks/Trie.py
stop_regex = make_regex(stop_words)

def clean_special_patterns(text):
    """Strip URLs, e-mail addresses and date-like digit groups out of text."""
    ## apply the module-level patterns in order: url, email, date
    for pattern in (url_regex, email_regex, date_regex):
        text = pattern.sub("", text)
    return text

def clean_stopwords(text):
    """Return text with every match of the precompiled stop_regex deleted."""
    return stop_regex.sub("", text)

def clean_keep_words(text):
    """Replace each match of keep_word_regex in text with a single space."""
    letters_only = keep_word_regex.sub(" ", text)
    return letters_only

def clean_text(text):
    """Clean text and return its lowercased tokens.

    Steps, in order: remove special patterns (url, email, date), remove
    stop words, replace non-letter runs with a space, tokenize, lowercase.

    output
    --------
    list: lowercased tokens
    """
    ## NOTE(review): stop words are removed before lowercasing; whether
    ## capitalised stop words are caught depends on make_regex - confirm
    cleaned = clean_keep_words(clean_stopwords(clean_special_patterns(text)))
    return [token.lower() for token in word_tokenize(cleaned)]

Note that there is no universal text cleaning method. For some classification tasks, special characters might be good features and should not be removed. For word2vec tasks, it is better not to stem the words, and some stop words may be important. For text generation, stop words might also be useful.