import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
PART A
Build a sequential NLP classifier that uses input text to determine customer sentiment, using a Bidirectional LSTM
Importing libraries and setup
tf.config.list_physical_devices()
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
pd.set_option('display.expand_frame_repr', False)
Q1. Import and analyse the data set
Importing the dataset from keras.datasets and keeping only the 10,000 most frequent words
most_frequent_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=most_frequent_words)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((25000,), (25000,), (25000,), (25000,))
Q2. Perform relevant sequence padding on the data
We’ll vectorize the reviews with multi-hot (one-hot) encoding: each review becomes a 10,000-dimensional vector with 1s at the indices of the words it contains.
def one_hot_encode(sequences, dimension=10000):
    # Multi-hot encode: set column j to 1 if word index j occurs in the review
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

whole_data = np.concatenate((x_train, x_test), axis=0)
label = np.concatenate((y_train, y_test), axis=0)
print("Categories:", np.unique(label))
print("Number of unique words:", len(np.unique(np.hstack(whole_data))))
Categories: [0 1]
Number of unique words: 9998
length = [len(i) for i in whole_data]
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))
Average Review length: 234.75892
Standard Deviation: 173
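These statistics can inform the choice of padding length used later; a minimal sketch of a percentile-based choice (the 95% threshold is an illustrative assumption, not part of the original analysis):
# Illustrative only: a maxlen that covers ~95% of reviews
suggested_maxlen = int(np.percentile(length, 95))
print("95th percentile of review length:", suggested_maxlen)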
feature_ohe = one_hot_encode(whole_data)
label = np.array(label).astype("float32")
Q3. Perform the following data analysis
- Print shape of features and labels
- Print the value of any one feature and its label
whole_data.shape, label.shape
((50000,), (50000,))
print(f'Feature first = {feature_ohe[0]} \n')
print(f'label first = {label[0]}')
Feature first = [0. 1. 1. ... 0. 0. 0.]

label first = 1.0
Q4. Decode the feature value to get original sentence
len(x_train)
len(x_train[0])
218
vocab_index = imdb.get_word_index()
vocab_index = { key:(value + 3) for key, value in vocab_index.items() }
vocab_index[''] = 0 # Padding
vocab_index['>'] = 1 # Start
vocab_index['?'] = 2 # Unknown word
reverse_word_dict = { value:key for key, value in vocab_index.items() }
reverse_vocab_index = dict([(value, key) for (key, value) in vocab_index.items()])
' '.join(reverse_word_dict[id] for id in x_train[2])
"> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had ? working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how ? this is to watch save yourself an hour a bit of your life"
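The same decoding can be wrapped in a small reusable helper (a minimal sketch based on the reverse_word_dict built above; the function name is our own):
def decode_review(encoded_review):
    # Map each integer id back to its word; ids not in the dictionary fall back to '?'
    return ' '.join(reverse_word_dict.get(idx, '?') for idx in encoded_review)

decode_review(x_train[2])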
Q5. Design, train, tune and test a sequential model
max_review_length = 926
x_train = pad_sequences(x_train, maxlen=max_review_length)
x_test = pad_sequences(x_test, maxlen=max_review_length)
x_train.shape, x_test.shape
((25000, 926), (25000, 926))
%%time
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(most_frequent_words, embedding_vector_length, input_length=max_review_length))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_2 (Embedding) (None, 926, 32) 320000
flatten_1 (Flatten) (None, 29632) 0
dense_4 (Dense) (None, 16) 474128
dense_5 (Dense) (None, 16) 272
dense_6 (Dense) (None, 1) 17
=================================================================
Total params: 794,417
Trainable params: 794,417
Non-trainable params: 0
_________________________________________________________________
None
CPU times: user 50.5 ms, sys: 36.4 ms, total: 87 ms
Wall time: 111 ms
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
with tf.device('/device:GPU:0'):
    results = model.fit(
        x_train, y_train,
        epochs=10,
        batch_size=128,
        validation_data=(x_test, y_test),
        callbacks=[callback],
        verbose=0
    )
_, train_acc = model.evaluate(x_train, y_train, verbose=1)
_, test_acc = model.evaluate(x_test, y_test, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
782/782 [==============================] - 5s 6ms/step - loss: 8.7406e-05 - accuracy: 1.0000
782/782 [==============================] - 5s 6ms/step - loss: 0.6214 - accuracy: 0.8732
Train: 1.000, Test: 0.873
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))Accuracy: 87.32%
Q6. Use the designed model to print the prediction on any one sample
preds = model.predict(x_test)
782/782 [==============================] - 5s 6ms/step
preds[0]
array([2.9718738e-05], dtype=float32)
def predict_sentiment(text):
    # Prepare the input: remove punctuation, convert to lower case,
    # and drop tokens that are not purely alphabetic
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.lower().split(' ')
    text = [word for word in text if word.isalpha()]
    # Encode with the IMDB word index: 1 = start token, 2 = unknown / out-of-vocabulary
    input_ids = [1]
    for word in text:
        if word in vocab_index and vocab_index[word] < most_frequent_words:
            input_ids.append(vocab_index[word])
        else:
            input_ids.append(2)
    padded_input = pad_sequences([input_ids], maxlen=max_review_length)
    # Run the model and return the sigmoid score (closer to 1 = positive sentiment)
    return model.predict(padded_input)[0][0]

predict_sentiment('Undoubtedly the most stellar experience I have ever watched.')
1/1 [==============================] - 0s 25ms/step
0.9829639
predict_sentiment('I had a really bad experience with the customer executive.')
1/1 [==============================] - 0s 17ms/step
0.1035999
PART B
Importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from zipfile import ZipFile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
Q1. Read and explore the data
sarcasm_df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
sarcasm_df.head(10)
|   | is_sarcastic | headline | article_link |
|---|---|---|---|
| 0 | 1 | thirtysomething scientists unveil doomsday clo... | https://www.theonion.com/thirtysomething-scien... |
| 1 | 0 | dem rep. totally nails why congress is falling... | https://www.huffingtonpost.com/entry/donna-edw... |
| 2 | 0 | eat your veggies: 9 deliciously different recipes | https://www.huffingtonpost.com/entry/eat-your-... |
| 3 | 1 | inclement weather prevents liar from getting t... | https://local.theonion.com/inclement-weather-p... |
| 4 | 1 | mother comes pretty close to using word 'strea... | https://www.theonion.com/mother-comes-pretty-c... |
| 5 | 0 | my white inheritance | https://www.huffingtonpost.com/entry/my-white-... |
| 6 | 0 | 5 ways to file your taxes with less stress | https://www.huffingtonpost.com/entry/5-ways-to... |
| 7 | 1 | richard branson's global-warming donation near... | https://www.theonion.com/richard-bransons-glob... |
| 8 | 1 | shadow government getting too large to meet in... | https://politics.theonion.com/shadow-governmen... |
| 9 | 0 | lots of parents know this scenario | https://www.huffingtonpost.comhttp://pubx.co/6... |
sarcasm_df.shape
(28619, 3)
Looking at the structure and summary statistics of the dataframe
sarcasm_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_sarcastic 28619 non-null int64
1 headline 28619 non-null object
2 article_link 28619 non-null object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB
sarcasm_df.describe()
|   | is_sarcastic |
|---|---|
| count | 28619.000000 |
| mean | 0.476397 |
| std | 0.499451 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
sarcasm_df.is_sarcastic.value_counts()
is_sarcastic
0    14985
1    13634
Name: count, dtype: int64
With 14,985 non-sarcastic and 13,634 sarcastic headlines (roughly 52% vs 48%), the data is nearly balanced, so no resampling or class weighting is needed.
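If the classes were more skewed, the imbalance could instead be handled with class weights at training time; a minimal sketch (illustrative only, not used in this project):
# Illustrative only: class proportions and example class weights
proportions = sarcasm_df.is_sarcastic.value_counts(normalize=True)
print(proportions)
class_weight = {0: 1.0, 1: proportions[0] / proportions[1]}  # up-weight the rarer class
# model.fit(..., class_weight=class_weight)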
Let’s look at the first headline
sarcasm_df.loc[0]['headline']
'thirtysomething scientists unveil doomsday clock of hair loss'
Q2. Retain relevant columns
For this project, the article_link column is not relevant. A follow-up project could scrape the full articles and analyse them in depth, but that is out of scope here.
sarcasm_relevant_df = sarcasm_df.drop('article_link', axis=1)
Q3. Get length of each sentence
# Note: len(text) counts characters, so this is the character length of each headline
length_each_sentence = [len(text) for text in sarcasm_relevant_df['headline']]
Let’s look at the maximum length from the headline column
max_length_sentence = max(length_each_sentence)
max_length_sentence
926
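Since this is a character count, a word-level length would be much smaller; a quick comparison (an aside only, not used in the rest of the notebook):
# Illustrative only: number of words per headline
word_lengths = [len(text.split()) for text in sarcasm_relevant_df['headline']]
print("Max words in a headline:", max(word_lengths))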
Q4. Define parameters
max_features = 10000
maxlen = max_length_sentence
embedding_size = 200
output_dim = 200
Q5. Get indices for words
We’ll now tokenize the headlines and build a word-to-index mapping with the Keras Tokenizer.
tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ', char_level=False)
tokenizer.fit_on_texts(sarcasm_relevant_df['headline'])
tokenizer.word_index
{'to': 1,
 'of': 2,
 'the': 3,
 'in': 4,
 'for': 5,
 'a': 6,
 'on': 7,
 'and': 8,
 'with': 9,
 'is': 10,
 ...}
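As a quick sanity check, a single headline can be pushed through the fitted tokenizer (a minimal usage sketch):
# Illustrative only: encode the first headline with the fitted tokenizer
first_headline = sarcasm_relevant_df['headline'][0]
print(first_headline)
print(tokenizer.texts_to_sequences([first_headline]))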
Q6. Create features and labels
X = tokenizer.texts_to_sequences(sarcasm_relevant_df['headline'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(sarcasm_relevant_df['is_sarcastic'])
print("Number of Feature samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])
Number of Feature samples: 28619
[   0    0    0 ...    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0  354 3166 7473 2643    2
  660 1118]
Number of Labels: 28619
1
Q7. Get vocabulary size
vocabulary_size = len(tokenizer.word_index)
print(vocabulary_size)
30884
Q8. Create a weight matrix using GloVe embeddings
glove_file = "glove.6B.zip"
#Extract Glove embedding zip file
with ZipFile(glove_file, 'r') as z:
z.extractall()EMBEDDING_FILE = './glove.6B.200d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE):
word = o.split(" ")[0]
# print(word)
embd = o.split(" ")[1:]
embd = np.asarray(embd, dtype='float32')
# print(embd)
embeddings[word] = embdembedding_matrix = np.zeros((vocabulary_size, 200))
for word, i in tokenizer.word_index.items():
embedding_vector = embeddings.get(word)
if embedding_vector is not None:
embedding_matrix[i-1] = embedding_vector
len(embeddings.values())400000
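Before building the model, it can be worth checking how much of the headline vocabulary is covered by GloVe (a minimal sketch using the objects built above):
# Illustrative only: fraction of tokenizer words that have a GloVe vector
covered = sum(1 for word in tokenizer.word_index if word in embeddings)
print(f"GloVe coverage: {covered} / {vocabulary_size} words ({covered / vocabulary_size:.1%})")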
Q9. Define and compile a Bidirectional LSTM model
### Embedding layer for hint
## model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
### Bidirectional LSTM layer for hint
## model.add(Bidirectional(LSTM(128, return_sequences = True)))
input_layer = Input(shape=(maxlen,), dtype=tf.int64)
embed = Embedding(embedding_matrix.shape[0], output_dim=200, weights=[embedding_matrix],
                  input_length=maxlen, trainable=True)(input_layer)
lstm = Bidirectional(LSTM(128))(embed)
drop = Dropout(0.3)(lstm)
dense = Dense(100, activation='relu')(drop)
out = Dense(2, activation='softmax')(dense)
Q10. Fit the model and check the validation accuracy
batch_size = 100
epochs = 5
model = Model(input_layer,out)
model.compile(loss='sparse_categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 926)] 0
embedding (Embedding) (None, 926, 200) 6176800
bidirectional (Bidirectiona (None, 256) 336896
l)
dropout_8 (Dropout) (None, 256) 0
dense_16 (Dense) (None, 100) 25700
dense_17 (Dense) (None, 2) 202
=================================================================
Total params: 6,539,598
Trainable params: 6,539,598
Non-trainable params: 0
_________________________________________________________________
tf.config.run_functions_eagerly(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
with tf.device('/device:GPU:0'):
    res = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
Epoch 1/5
229/229 [==============================] - 173s 745ms/step - loss: 0.5122 - accuracy: 0.7243
Epoch 2/5
229/229 [==============================] - 235s 1s/step - loss: 0.2859 - accuracy: 0.8794
Epoch 3/5
229/229 [==============================] - 167s 729ms/step - loss: 0.1933 - accuracy: 0.9240
Epoch 4/5
229/229 [==============================] - 165s 719ms/step - loss: 0.1287 - accuracy: 0.9537
Epoch 5/5
229/229 [==============================] - 162s 706ms/step - loss: 0.0852 - accuracy: 0.9716
predictions = model.predict(np.array(X_test), verbose=1)
179/179 [==============================] - 12s 60ms/step
# Convert the two softmax outputs to a class label: pick class 1 when its probability is higher (argmax)
test_pred = [0 if i > j else 1 for i, j in predictions]
print(classification_report(y_test, test_pred))
              precision    recall  f1-score   support
           0       0.85      0.88      0.86      2977
           1       0.86      0.83      0.84      2747

    accuracy                           0.85      5724
   macro avg       0.85      0.85      0.85      5724
weighted avg       0.85      0.85      0.85      5724
We reach about 97% accuracy on the training data but roughly 85% on the held-out test set, which points to some overfitting. The model could be tuned further by training for more epochs with early stopping, adding hidden layers, and increasing dropout. The project ends here.
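As a starting point for that tuning, one possible variant is sketched below; the extra dropout rate, validation split, and early-stopping settings are our assumptions, not results from this project.
# Illustrative only: add dropout before a second dense block and train with early stopping
dense2 = Dense(100, activation='relu')(Dropout(0.4)(dense))
out2 = Dense(2, activation='softmax')(dense2)
tuned_model = Model(input_layer, out2)
tuned_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
tuned_model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size,
                epochs=10, callbacks=[early_stop], verbose=1)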