import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
PART A
Build a sequential NLP classifier that uses input text to determine customer sentiment, using a Bidirectional LSTM
Importing libraries and setup
tf.config.list_physical_devices()
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
pd.set_option('display.expand_frame_repr', False)
Q1. Import and analyse the data set
Importing the dataset from keras.datasets and keeping only the 10,000 most frequent words
most_frequent_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=most_frequent_words)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((25000,), (25000,), (25000,), (25000,))
Q2. Perform relevant sequence padding on the data
We’ll vectorize the reviews with multi-hot (one-hot) encoding: each review becomes a 10,000-dimensional vector with 1s at the indices of the words it contains.
def one_hot_encode(sequences, dimension=10000):
    # Multi-hot encode: set column j to 1 if word index j occurs in the review
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

whole_data = np.concatenate((x_train, x_test), axis=0)
label = np.concatenate((y_train, y_test), axis=0)
print("Categories:", np.unique(label))
print("Number of unique words:", len(np.unique(np.hstack(whole_data))))
Categories: [0 1]
Number of unique words: 9998
length = [len(i) for i in whole_data]
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))
Average Review length: 234.75892
Standard Deviation: 173
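These statistics can inform the choice of padding length used later; a minimal sketch of a percentile-based choice (the 95% threshold is an illustrative assumption, not part of the original analysis):
# Illustrative only: a maxlen that covers ~95% of reviews
suggested_maxlen = int(np.percentile(length, 95))
print("95th percentile of review length:", suggested_maxlen)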
feature_ohe = one_hot_encode(whole_data)
label = np.array(label).astype("float32")
Q3. Perform the following data analysis
- Print shape of features and labels
- Print the value of any one feature and its label
whole_data.shape, label.shape
((50000,), (50000,))
print(f'Feature first = {feature_ohe[0]} \n')
print(f'label first = {label[0]}')
Feature first = [0. 1. 1. ... 0. 0. 0.]

label first = 1.0
Q4. Decode the feature value to get original sentence
len(x_train)
len(x_train[0])
218
vocab_index = imdb.get_word_index()
vocab_index = { key:(value + 3) for key, value in vocab_index.items() }
vocab_index[''] = 0 # Padding
vocab_index['>'] = 1 # Start
vocab_index['?'] = 2 # Unknown word
reverse_word_dict = { value:key for key, value in vocab_index.items() }
reverse_vocab_index = dict([(value, key) for (key, value) in vocab_index.items()])
' '.join(reverse_word_dict[id] for id in x_train[2])
"> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had ? working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how ? this is to watch save yourself an hour a bit of your life"
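The same decoding can be wrapped in a small reusable helper (a minimal sketch based on the reverse_word_dict built above; the function name is our own):
def decode_review(encoded_review):
    # Map each integer id back to its word; ids not in the dictionary fall back to '?'
    return ' '.join(reverse_word_dict.get(idx, '?') for idx in encoded_review)

decode_review(x_train[2])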
Q5. Design, train, tune and test a sequential model
max_review_length = 926
x_train = pad_sequences(x_train, maxlen=max_review_length)
x_test = pad_sequences(x_test, maxlen=max_review_length)
x_train.shape, x_test.shape
((25000, 926), (25000, 926))
%%time
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(most_frequent_words, embedding_vector_length, input_length=max_review_length))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_2 (Embedding) (None, 926, 32) 320000
flatten_1 (Flatten) (None, 29632) 0
dense_4 (Dense) (None, 16) 474128
dense_5 (Dense) (None, 16) 272
dense_6 (Dense) (None, 1) 17
=================================================================
Total params: 794,417
Trainable params: 794,417
Non-trainable params: 0
_________________________________________________________________
None
CPU times: user 50.5 ms, sys: 36.4 ms, total: 87 ms
Wall time: 111 ms
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
with tf.device('/device:GPU:0'):
    results = model.fit(
        x_train, y_train,
        epochs=10,
        batch_size=128,
        validation_data=(x_test, y_test),
        callbacks=[callback],
        verbose=0
    )
_, train_acc = model.evaluate(x_train, y_train, verbose=1)
_, test_acc = model.evaluate(x_test, y_test, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
782/782 [==============================] - 5s 6ms/step - loss: 8.7406e-05 - accuracy: 1.0000
782/782 [==============================] - 5s 6ms/step - loss: 0.6214 - accuracy: 0.8732
Train: 1.000, Test: 0.873
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))Accuracy: 87.32%
Q6. Use the designed model to print the prediction on any one sample
preds = model.predict(x_test)
782/782 [==============================] - 5s 6ms/step
preds[0]
array([2.9718738e-05], dtype=float32)
def predict_sentiment(text):
    # Prepare the input: remove punctuation, convert to lower case,
    # and drop tokens that are not purely alphabetic
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.lower().split(' ')
    text = [word for word in text if word.isalpha()]
    # Encode with the IMDB word index: 1 = start token, 2 = unknown / out-of-vocabulary
    input_ids = [1]
    for word in text:
        if word in vocab_index and vocab_index[word] < most_frequent_words:
            input_ids.append(vocab_index[word])
        else:
            input_ids.append(2)
    padded_input = pad_sequences([input_ids], maxlen=max_review_length)
    # Run the model and return the sigmoid score (closer to 1 = positive sentiment)
    return model.predict(padded_input)[0][0]

predict_sentiment('Undoubtedly the most stellar experience I have ever watched.')
1/1 [==============================] - 0s 25ms/step
0.9829639
predict_sentiment('I had a really bad experience with the customer executive.')
1/1 [==============================] - 0s 17ms/step
0.1035999
PART B
Importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from zipfile import ZipFile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
Q1. Read and explore the data
sarcasm_df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
sarcasm_df.head(10)
|   | is_sarcastic | headline | article_link |
|---|---|---|---|
| 0 | 1 | thirtysomething scientists unveil doomsday clo... | https://www.theonion.com/thirtysomething-scien... |
| 1 | 0 | dem rep. totally nails why congress is falling... | https://www.huffingtonpost.com/entry/donna-edw... |
| 2 | 0 | eat your veggies: 9 deliciously different recipes | https://www.huffingtonpost.com/entry/eat-your-... |
| 3 | 1 | inclement weather prevents liar from getting t... | https://local.theonion.com/inclement-weather-p... |
| 4 | 1 | mother comes pretty close to using word 'strea... | https://www.theonion.com/mother-comes-pretty-c... |
| 5 | 0 | my white inheritance | https://www.huffingtonpost.com/entry/my-white-... |
| 6 | 0 | 5 ways to file your taxes with less stress | https://www.huffingtonpost.com/entry/5-ways-to... |
| 7 | 1 | richard branson's global-warming donation near... | https://www.theonion.com/richard-bransons-glob... |
| 8 | 1 | shadow government getting too large to meet in... | https://politics.theonion.com/shadow-governmen... |
| 9 | 0 | lots of parents know this scenario | https://www.huffingtonpost.comhttp://pubx.co/6... |
sarcasm_df.shape
(28619, 3)
Looking at the structure and summary statistics of the dataframe
sarcasm_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_sarcastic 28619 non-null int64
1 headline 28619 non-null object
2 article_link 28619 non-null object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB
sarcasm_df.describe()
|   | is_sarcastic |
|---|---|
| count | 28619.000000 |
| mean | 0.476397 |
| std | 0.499451 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
sarcasm_df.is_sarcastic.value_counts()
is_sarcastic
0    14985
1    13634
Name: count, dtype: int64
With 14,985 non-sarcastic and 13,634 sarcastic headlines (roughly 52% vs 48%), the data is nearly balanced, so no resampling or class weighting is needed.
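If the classes were more skewed, the imbalance could instead be handled with class weights at training time; a minimal sketch (illustrative only, not used in this project):
# Illustrative only: class proportions and example class weights
proportions = sarcasm_df.is_sarcastic.value_counts(normalize=True)
print(proportions)
class_weight = {0: 1.0, 1: proportions[0] / proportions[1]}  # up-weight the rarer class
# model.fit(..., class_weight=class_weight)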
Let’s look at the first headline
sarcasm_df.loc[0]['headline']
'thirtysomething scientists unveil doomsday clock of hair loss'
Q2. Retain relevant columns
For this project, the article_link column is not relevant. A follow-up project could scrape the full articles and analyse them in depth, but that is out of scope here.
sarcasm_relevant_df = sarcasm_df.drop('article_link', axis=1)
Q3. Get length of each sentence
# Note: len(text) counts characters, so this is the character length of each headline
length_each_sentence = [len(text) for text in sarcasm_relevant_df['headline']]
Let’s look at the maximum length from the headline column
max_length_sentence = max(length_each_sentence)
max_length_sentence
926
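Since this is a character count, a word-level length would be much smaller; a quick comparison (an aside only, not used in the rest of the notebook):
# Illustrative only: number of words per headline
word_lengths = [len(text.split()) for text in sarcasm_relevant_df['headline']]
print("Max words in a headline:", max(word_lengths))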
Q4. Define parameters
max_features = 10000
maxlen = max_length_sentence
embedding_size = 200
output_dim = 200
Q5. Get indices for words
We’ll now tokenize the headlines and build a word-to-index mapping with the Keras Tokenizer.
tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ', char_level=False)
tokenizer.fit_on_texts(sarcasm_relevant_df['headline'])
tokenizer.word_index
{'to': 1,
 'of': 2,
 'the': 3,
 'in': 4,
 'for': 5,
 'a': 6,
 'on': 7,
 'and': 8,
 'with': 9,
 'is': 10,
 ...}
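As a quick sanity check, a single headline can be pushed through the fitted tokenizer (a minimal usage sketch):
# Illustrative only: encode the first headline with the fitted tokenizer
first_headline = sarcasm_relevant_df['headline'][0]
print(first_headline)
print(tokenizer.texts_to_sequences([first_headline]))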
Q6. Create features and labels
X = tokenizer.texts_to_sequences(sarcasm_relevant_df['headline'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(sarcasm_relevant_df['is_sarcastic'])
print("Number of Feature samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])
Number of Feature samples: 28619
[   0    0    0 ...    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0  354 3166 7473 2643    2
  660 1118]
Number of Labels: 28619
1
Q7. Get vocabulary size
vocabulary_size = len(tokenizer.word_index)
print(vocabulary_size)
30884
Q8. Create a weight matrix using GloVe embeddings
glove_file = "glove.6B.zip"
#Extract Glove embedding zip file
with ZipFile(glove_file, 'r') as z:
z.extractall()EMBEDDING_FILE = './glove.6B.200d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE):
word = o.split(" ")[0]
# print(word)
embd = o.split(" ")[1:]
embd = np.asarray(embd, dtype='float32')
# print(embd)
embeddings[word] = embdembedding_matrix = np.zeros((vocabulary_size, 200))
for word, i in tokenizer.word_index.items():
embedding_vector = embeddings.get(word)
if embedding_vector is not None:
embedding_matrix[i-1] = embedding_vector
len(embeddings.values())400000
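Before building the model, it can be worth checking how much of the headline vocabulary is covered by GloVe (a minimal sketch using the objects built above):
# Illustrative only: fraction of tokenizer words that have a GloVe vector
covered = sum(1 for word in tokenizer.word_index if word in embeddings)
print(f"GloVe coverage: {covered} / {vocabulary_size} words ({covered / vocabulary_size:.1%})")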
Q9. Define and compile a Bidirectional LSTM model
### Embedding layer for hint
## model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
### Bidirectional LSTM layer for hint
## model.add(Bidirectional(LSTM(128, return_sequences = True)))
input_layer = Input(shape=(maxlen,), dtype=tf.int64)
embed = Embedding(embedding_matrix.shape[0], output_dim=200, weights=[embedding_matrix],
                  input_length=maxlen, trainable=True)(input_layer)
lstm = Bidirectional(LSTM(128))(embed)
drop = Dropout(0.3)(lstm)
dense = Dense(100, activation='relu')(drop)
out = Dense(2, activation='softmax')(dense)
Q10. Fit the model and check the validation accuracy
batch_size = 100
epochs = 5
model = Model(input_layer,out)
model.compile(loss='sparse_categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 926)] 0
embedding (Embedding) (None, 926, 200) 6176800
bidirectional (Bidirectiona (None, 256) 336896
l)
dropout_8 (Dropout) (None, 256) 0
dense_16 (Dense) (None, 100) 25700
dense_17 (Dense) (None, 2) 202
=================================================================
Total params: 6,539,598
Trainable params: 6,539,598
Non-trainable params: 0
_________________________________________________________________
tf.config.run_functions_eagerly(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
with tf.device('/device:GPU:0'):
    res = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
Epoch 1/5
229/229 [==============================] - 173s 745ms/step - loss: 0.5122 - accuracy: 0.7243
Epoch 2/5
229/229 [==============================] - 235s 1s/step - loss: 0.2859 - accuracy: 0.8794
Epoch 3/5
229/229 [==============================] - 167s 729ms/step - loss: 0.1933 - accuracy: 0.9240
Epoch 4/5
229/229 [==============================] - 165s 719ms/step - loss: 0.1287 - accuracy: 0.9537
Epoch 5/5
229/229 [==============================] - 162s 706ms/step - loss: 0.0852 - accuracy: 0.9716
predictions = model.predict(np.array(X_test), verbose=1)
179/179 [==============================] - 12s 60ms/step
# Convert the two softmax outputs to a class label: pick class 1 when its probability is higher (argmax)
test_pred = [0 if i > j else 1 for i, j in predictions]
print(classification_report(y_test, test_pred))
              precision    recall  f1-score   support
           0       0.85      0.88      0.86      2977
           1       0.86      0.83      0.84      2747

    accuracy                           0.85      5724
   macro avg       0.85      0.85      0.85      5724
weighted avg       0.85      0.85      0.85      5724
We reach about 97% accuracy on the training data but roughly 85% on the held-out test set, which points to some overfitting. The model could be tuned further by training for more epochs with early stopping, adding hidden layers, and increasing dropout. The project ends here.
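As a starting point for that tuning, one possible variant is sketched below; the extra dropout rate, validation split, and early-stopping settings are our assumptions, not results from this project.
# Illustrative only: add dropout before a second dense block and train with early stopping
dense2 = Dense(100, activation='relu')(Dropout(0.4)(dense))
out2 = Dense(2, activation='softmax')(dense2)
tuned_model = Model(input_layer, out2)
tuned_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
tuned_model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size,
                epochs=10, callbacks=[early_stop], verbose=1)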