← Back

Program 14: Using Python NLTK, perform the following Natural Language Processing (NLP) tasks on text content: a) tokenizing, b) filtering stop words, c) stemming, d) part-of-speech tagging, e) chunking, f) named entity recognition (NER).

Simple Python Code
# Walks one sample text through six NLTK steps and prints each result:
# tokenizing, stop-word filtering, stemming, POS tagging, chunking and NER.
try:
    # numpy is not referenced directly in this script.
    # NOTE(review): presumably imported so a missing numpy is reported up
    # front alongside NLTK (the install hint below names both) - confirm.
    import numpy
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk import pos_tag, ne_chunk
except ImportError:
    print("NLTK is not installed. Run: pip install nltk numpy")
    # Exit with a non-zero status so shells/CI can tell the run failed;
    # a bare exit() would report success (status 0) and depends on the
    # site-provided interactive helper.
    raise SystemExit(1)

#nltk start
# Data packages loaded by the tokenizer, stop-word list, POS tagger and
# NE chunker used below. Kept outside the try block: only the imports above
# are what the ImportError handler is meant to guard.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource)
#nltk end

text = "Natural Language Processing with Python is amazing. NLTK makes it easy!"

# a) Tokenization
print("a) Tokenization:")
tokens = word_tokenize(text)
print(tokens, "\n")

# b) Stopword Removal
print("b) Stopword Removal:")
# A set gives O(1) membership tests; comparison is done on the lowercased
# token so capitalised stop words are filtered too.
stop_words = set(stopwords.words("english"))
filtered_words = [w for w in tokens if w.lower() not in stop_words]
print(filtered_words, "\n")

# c) Stemming
print("c) Stemming:")
stemmer = PorterStemmer()
print([stemmer.stem(w) for w in filtered_words], "\n")

# d) POS Tagging
# Tagging runs on the full token list (not the filtered one) so the tagger
# sees the complete sentences.
print("d) POS Tagging:")
pos_tags = pos_tag(tokens)
print(pos_tags, "\n")

# e) Chunking
print("e) Chunking:")
# Noun phrase = optional determiner, any number of adjectives, then a noun.
# NOTE(review): <NN> matches only the exact tag NN; proper or plural nouns
# (NNP/NNS) would need <NN.*> - confirm this restriction is intended.
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunked = nltk.RegexpParser(grammar).parse(pos_tags)
print(chunked, "\n")

# f) Named Entity Recognition
print("f) Named Entity Recognition:")
try:
    ner = ne_chunk(pos_tags)
    print(ner)
except LookupError:
    # Raised when the chunker's data packages could not be found.
    print("NER data error.")
Advanced Python Code
# Walks one sample text through six NLTK steps and prints each result:
# tokenizing, stop-word filtering, stemming, POS tagging, chunking and NER.
try:
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk import pos_tag, ne_chunk
except ImportError:
    print("NLTK is not installed. Run: pip install nltk")
    # Exit with a non-zero status so shells/CI can tell the run failed;
    # a bare exit() would report success (status 0) and depends on the
    # site-provided interactive helper.
    raise SystemExit(1)

# Fetch the data packages the steps below load at runtime. Without them the
# tokenizer, stop-word list and tagger raise LookupError on a fresh install,
# and only the NER step handles that error. quiet=True keeps the downloader
# from adding its own messages to this script's output.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource, quiet=True)

text = "Natural Language Processing with Python is amazing. NLTK makes it easy!"

# a) Tokenization
print("a) Tokenization:")
tokens = word_tokenize(text)
print(tokens, "\n")

# b) Stopword Removal
print("b) Stopword Removal:")
# A set gives O(1) membership tests; comparison is done on the lowercased
# token so capitalised stop words are filtered too.
stop_words = set(stopwords.words("english"))
filtered_words = [w for w in tokens if w.lower() not in stop_words]
print(filtered_words, "\n")

# c) Stemming
print("c) Stemming:")
stemmer = PorterStemmer()
print([stemmer.stem(w) for w in filtered_words], "\n")

# d) POS Tagging
# Tagging runs on the full token list (not the filtered one) so the tagger
# sees the complete sentences.
print("d) POS Tagging:")
pos_tags = pos_tag(tokens)
print(pos_tags, "\n")

# e) Chunking
print("e) Chunking:")
# Noun phrase = optional determiner, any number of adjectives, then a noun.
# NOTE(review): <NN> matches only the exact tag NN; proper or plural nouns
# (NNP/NNS) would need <NN.*> - confirm this restriction is intended.
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunked = nltk.RegexpParser(grammar).parse(pos_tags)
print(chunked, "\n")

# f) Named Entity Recognition
print("f) Named Entity Recognition:")
try:
    ner = ne_chunk(pos_tags)
    print(ner)
except LookupError:
    # Raised when the chunker's data packages could not be found.
    print("NER data error.")
Infographics