This file is licensed under the Creative Commons Attribution-Share Alike 4.0 (CC BY-SA) license: https://creativecommons.org/licenses/by-sa/4.0 Attribution: Sven Mayer
## Uncomment if you are missing packages
#! pip install numpy
#! pip install lxml
#! pip install nltk
#! pip install beautifulsoup4
#! pip install tensorflow
import ssl
# Work around SSL certificate issues (e.g., on macOS) so that nltk.download() can fetch its data
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import IPython
from IPython.display import display
# Natural Language Toolkit
# https://www.nltk.org/index.html
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')
[nltk_data] Downloading package punkt to /Users/mayer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to /Users/mayer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to /Users/mayer/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/mayer/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /Users/mayer/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
True
#sentence = "Albrecht is from Munich"
#sentence = "Albrecht and Andreas are from Munich"
#sentence = "Albrecht and Sven are from Munich"
#sentence = "Sven is from Munich"
sentence = "Albrecht and Andreas are from the LMU Munich"
tokens = nltk.word_tokenize(sentence)
tokens
['Albrecht', 'and', 'Andreas', 'are', 'from', 'the', 'LMU', 'Munich']
tokens = nltk.word_tokenize(sentence) # Tokenizer
tag = nltk.pos_tag(tokens) # Tagging Words
ne_tree = nltk.ne_chunk(tag) # Named-Entity Recognition
#print(ne_tree)
IPython.core.display.display(ne_tree)
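The chunk tree can also be traversed programmatically. A minimal sketch (not in the original notebook) that collects the labelled entities from ne_tree:
# Sketch: collect named entities from the chunk tree produced by nltk.ne_chunk
for subtree in ne_tree:
    if hasattr(subtree, "label"):  # chunked subtrees carry an entity label such as PERSON or GPE
        entity = " ".join(token for token, pos in subtree.leaves())
        print(subtree.label(), "-", entity)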
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
sentences = [
    'We are happy!',
    'Today, I am happy!',
    'You are so happy',
    'The best life ever',
    'I am sad',
    'We are sad',
    'We are super sad!',
    "We are all so sad today!"
]
for s in sentences:
    print(s, " - ", sid.polarity_scores(s))
We are happy!  -  {'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6114}
Today, I am happy!  -  {'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6114}
You are so happy  -  {'neg': 0.0, 'neu': 0.388, 'pos': 0.612, 'compound': 0.6948}
The best life ever  -  {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}
I am sad  -  {'neg': 0.756, 'neu': 0.244, 'pos': 0.0, 'compound': -0.4767}
We are sad  -  {'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}
We are super sad!  -  {'neg': 0.334, 'neu': 0.215, 'pos': 0.451, 'compound': 0.2714}
We are all so sad today!  -  {'neg': 0.461, 'neu': 0.539, 'pos': 0.0, 'compound': -0.6466}
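The compound score can be collapsed into a coarse label. A small sketch using the commonly cited ±0.05 threshold (the threshold is a convention, not something fixed by VADER):
# Sketch: map the compound score to a coarse sentiment label
def vader_label(text, threshold=0.05):
    compound = sid.polarity_scores(text)["compound"]
    if compound >= threshold:
        return "positive"
    if compound <= -threshold:
        return "negative"
    return "neutral"

for s in sentences:
    print(s, "->", vader_label(s))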
Minimal viable example: a tiny Keras text classifier trained on the sentences above
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = 100, oov_token="##OOV##") # Out Of Vocabulary token
tokenizer.fit_on_texts(sentences)
tokenizer.word_index
{'##OOV##': 1, 'are': 2, 'we': 3, 'sad': 4, 'happy': 5, 'today': 6, 'i': 7, 'am': 8, 'so': 9, 'you': 10, 'the': 11, 'best': 12, 'life': 13, 'ever': 14, 'super': 15, 'all': 16}
sequences = tokenizer.texts_to_sequences(sentences)
sequences
[[3, 2, 5], [6, 7, 8, 5], [10, 2, 9, 5], [11, 12, 13, 14], [7, 8, 4], [3, 2, 4], [3, 2, 15, 4], [3, 2, 16, 9, 4, 6]]
sentences_new = ["Hello IUI class!", "The IUI class is happy"]
sequences_new = tokenizer.texts_to_sequences(sentences_new)
print(sequences_new)
[[1, 1, 1], [11, 1, 1, 1, 5]]
tokenizer.sequences_to_texts(sequences_new)
['##OOV## ##OOV## ##OOV##', 'the ##OOV## ##OOV## ##OOV## happy']
sequences_padded = pad_sequences(sequences, maxlen=7, padding="post", truncating="post")
sequences_padded
array([[ 3,  2,  5,  0,  0,  0,  0],
       [ 6,  7,  8,  5,  0,  0,  0],
       [10,  2,  9,  5,  0,  0,  0],
       [11, 12, 13, 14,  0,  0,  0],
       [ 7,  8,  4,  0,  0,  0,  0],
       [ 3,  2,  4,  0,  0,  0,  0],
       [ 3,  2, 15,  4,  0,  0,  0],
       [ 3,  2, 16,  9,  4,  6,  0]], dtype=int32)
sequences
[[3, 2, 5], [6, 7, 8, 5], [10, 2, 9, 5], [11, 12, 13, 14], [7, 8, 4], [3, 2, 4], [3, 2, 15, 4], [3, 2, 16, 9, 4, 6]]
input_length = sequences_padded.shape[-1]
train_sequences = sequences_padded
train_classes = ["happy", "happy", "happy", "happy", "sad", "sad", "sad", "sad"]
class_dict = {"happy":0, "sad":1}
train_labels = np.array([class_dict[c] for c in train_classes])
train_labels
array([0, 0, 0, 0, 1, 1, 1, 1])
train_sequences = sequences_padded
train_classes = np.array([1, 1, 1, 1, 0, 0, 0, 0])  # note: unlike class_dict above, this encoding uses 1 for "happy" and 0 for "sad"
train_labels = tf.keras.utils.to_categorical(train_classes)
#train_labels
train_labels
array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)
train_sequences
array([[ 3,  2,  5,  0,  0,  0,  0],
       [ 6,  7,  8,  5,  0,  0,  0],
       [10,  2,  9,  5,  0,  0,  0],
       [11, 12, 13, 14,  0,  0,  0],
       [ 7,  8,  4,  0,  0,  0,  0],
       [ 3,  2,  4,  0,  0,  0,  0],
       [ 3,  2, 15,  4,  0,  0,  0],
       [ 3,  2, 16,  9,  4,  6,  0]], dtype=int32)
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_length,)),
    #tf.keras.layers.Embedding(len(tokenizer.word_index)+1, 16),
    #tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(2)  # outputs raw logits for the two classes
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),  # from_logits=True because the last layer has no softmax
              metrics=['accuracy'])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 10) 80 _________________________________________________________________ dense_1 (Dense) (None, 2) 22 ================================================================= Total params: 102 Trainable params: 102 Non-trainable params: 0 _________________________________________________________________
model.fit(train_sequences, train_labels, epochs=10)
Epoch 1/10
1/1 [==============================] - 0s 883us/step - loss: 3.1493 - accuracy: 0.5000
Epoch 2/10
1/1 [==============================] - 0s 1ms/step - loss: 3.0400 - accuracy: 0.5000
Epoch 3/10
1/1 [==============================] - 0s 1ms/step - loss: 2.9829 - accuracy: 0.5000
Epoch 4/10
1/1 [==============================] - 0s 1ms/step - loss: 2.9437 - accuracy: 0.5000
Epoch 5/10
1/1 [==============================] - 0s 1ms/step - loss: 2.9136 - accuracy: 0.5000
Epoch 6/10
1/1 [==============================] - 0s 1ms/step - loss: 2.8892 - accuracy: 0.5000
Epoch 7/10
1/1 [==============================] - 0s 1ms/step - loss: 2.8690 - accuracy: 0.5000
Epoch 8/10
1/1 [==============================] - 0s 916us/step - loss: 2.8519 - accuracy: 0.5000
Epoch 9/10
1/1 [==============================] - 0s 1ms/step - loss: 2.8367 - accuracy: 0.5000
Epoch 10/10
1/1 [==============================] - 0s 1ms/step - loss: 1.5507 - accuracy: 0.5000
<tensorflow.python.keras.callbacks.History at 0x7fbdcd2e58d0>
class_probability = model.predict(sequences_padded)  # raw logits, not calibrated probabilities (the model has no softmax layer)
class_probability
array([[ 3.6603293 ,  0.1716539 ],
       [ 4.841127  ,  1.4886731 ],
       [ 4.1087847 ,  0.03218371],
       [ 5.449742  ,  2.9826021 ],
       [ 3.338635  ,  1.4783506 ],
       [ 2.9419942 ,  0.1674317 ],
       [ 8.972211  ,  2.447952  ],
       [12.668601  ,  1.4783134 ]], dtype=float32)
predicted_class = np.argmax(class_probability, axis=1)
predicted_class
array([0, 0, 0, 0, 0, 0, 0, 0])
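To classify text that was not in the training set, the same tokenizer and padding have to be applied first. A minimal sketch reusing the objects defined above (the example sentences are made up, and softmax is applied explicitly because the model outputs logits):
# Sketch: run the trained model on unseen sentences (hypothetical examples)
new_texts = ["Today I am so happy", "This is a sad day"]
new_sequences = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                              maxlen=input_length, padding="post", truncating="post")
new_logits = model.predict(new_sequences)
new_probs = tf.nn.softmax(new_logits, axis=1).numpy()  # convert logits to class probabilities
print(new_probs, np.argmax(new_probs, axis=1))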
Text summarization with NLTK, adapted from https://www.kaggle.com/imkrkannan/text-summarization-with-nltk-in-python
import urllib.request
import bs4 as bs
import re
import nltk
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Severe_acute_respiratory_syndrome_coronavirus_2')
article = scraped_data.read()
parsed_article = bs.BeautifulSoup(article,'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
    article_text += p.text
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)
# Removing special characters and digits
formatted_article_text = re.sub(r'[^a-zA-Z]', ' ', article_text)
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)
sentence_list = nltk.sent_tokenize(article_text)
nltk.download('stopwords')  # the stopword list is not part of the downloads at the top
stopwords = nltk.corpus.stopwords.words('english')
word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
maximum_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency
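A quick way to sanity-check the normalized frequencies is to look at the highest-scoring words; a small sketch (not part of the original tutorial):
# Sketch: print the ten highest-weighted words
for w in sorted(word_frequencies, key=word_frequencies.get, reverse=True)[:10]:
    print(w, round(word_frequencies[w], 3))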
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
import heapq
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
summary = ' '.join(summary_sentences)
print(summary)
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is the strain of coronavirus that causes coronavirus disease 2019 (COVID-19), the respiratory illness responsible for the COVID-19 pandemic. During the initial outbreak in Wuhan, China, various names were used for the virus; some names used by different sources included the "coronavirus" or "Wuhan coronavirus". Other studies have suggested that the virus may be airborne as well, with aerosols potentially being able to transmit the virus. Based on whole genome sequence similarity, a pangolin coronavirus candidate strain was found to be less similar than RaTG13, but more similar than other bat coronaviruses to SARS-CoV-2. This means each infection from the virus is expected to result in 5.7 new infections when no members of the community are immune and no preventive measures are taken. Studies have shown that SARS-CoV-2 has a higher affinity to human ACE2 than the original SARS virus strain. However, other research indicates that visitors may have introduced the virus to the market, which then facilitated rapid expansion of the infections.