import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import models
from keras import layers
import string
from keras.utils import pad_sequences
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
PART A
Build a sequential NLP classifier which can use input text parameters to determine customer sentiment, using a Bidirectional LSTM
Importing libraries and setup
tf.config.list_physical_devices()
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
pd.set_option('display.expand_frame_repr', False)
Q1. Import and analyse the data set
Importing the dataset from keras.datasets, keeping only the top 10,000 most frequent words
most_frequent_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=most_frequent_words)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((25000,), (25000,), (25000,), (25000,))
Q2. Perform relevant sequence padding on the data
We’ll vectorize the reviews using one-hot encoding and build a feature matrix from the encoded sequences.
def one_hot_encode(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results
whole_data = np.concatenate((x_train, x_test), axis=0)
label = np.concatenate((y_train, y_test), axis=0)
print("Categories:", np.unique(label))
print("Number of unique words:", len(np.unique(np.hstack(whole_data))))
Categories: [0 1]
Number of unique words: 9998
length = [len(i) for i in whole_data]
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))
Average Review length: 234.75892
Standard Deviation: 173
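Before settling on a padding length later, it can help to see how review lengths are distributed. A minimal sketch, assuming the length list computed above and the matplotlib import from the top of the notebook:

# Histogram of review lengths (in word indices) to guide the choice of maxlen
plt.hist(length, bins=50)
plt.xlabel('Review length (number of tokens)')
plt.ylabel('Number of reviews')
plt.title('Distribution of IMDB review lengths')
plt.show()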
feature_ohe = one_hot_encode(whole_data)
label = np.array(label).astype("float32")
Q3. Perform the following data analysis
- Print the shape of the features and labels
- Print the value of any one feature and its label
whole_data.shape, label.shape
((50000,), (50000,))
print(f'Feature first = {feature_ohe} \n')
print(f'label first = {label[0]}')
Feature first = [[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
...
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]
[0. 1. 1. ... 0. 0. 0.]]
label first = 1.0
Q4. Decode the feature value to get original sentence
len(x_train)
len(x_train[0])
218
vocab_index = imdb.get_word_index()
vocab_index = {key: (value + 3) for key, value in vocab_index.items()}
vocab_index[''] = 0   # Padding
vocab_index['>'] = 1  # Start of sequence
vocab_index['?'] = 2  # Unknown word
reverse_word_dict = {value: key for key, value in vocab_index.items()}
' '.join(reverse_word_dict[id] for id in x_train[2])
"> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had ? working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how ? this is to watch save yourself an hour a bit of your life"
Q5. Design, train, tune and test a sequential model
max_review_length = 926
x_train = pad_sequences(x_train, maxlen=max_review_length)
x_test = pad_sequences(x_test, maxlen=max_review_length)
x_train.shape, x_test.shape
((25000, 926), (25000, 926))
%%time
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(most_frequent_words, embedding_vector_length, input_length=max_review_length))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_2 (Embedding) (None, 926, 32) 320000
flatten_1 (Flatten) (None, 29632) 0
dense_4 (Dense) (None, 16) 474128
dense_5 (Dense) (None, 16) 272
dense_6 (Dense) (None, 1) 17
=================================================================
Total params: 794,417
Trainable params: 794,417
Non-trainable params: 0
_________________________________________________________________
None
CPU times: user 50.5 ms, sys: 36.4 ms, total: 87 ms
Wall time: 111 ms
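The parameter counts in the summary can be verified by hand; a short sanity check (plain arithmetic, not part of the original notebook):

assert 10000 * 32 == 320000            # Embedding: vocab size x embedding dim
assert 926 * 32 * 16 + 16 == 474128    # Dense 1: flattened 29,632 inputs x 16 units + biases
assert 16 * 16 + 16 == 272             # Dense 2: 16 x 16 + biases
assert 16 * 1 + 1 == 17                # Output: 16 x 1 + bias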
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

with tf.device('/device:GPU:0'):
    results = model.fit(
        x_train, y_train,
        epochs=10,
        batch_size=128,
        validation_data=(x_test, y_test),
        callbacks=[callback],
        verbose=0
    )
_, train_acc = model.evaluate(x_train, y_train, verbose=1)
_, test_acc = model.evaluate(x_test, y_test, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
782/782 [==============================] - 5s 6ms/step - loss: 8.7406e-05 - accuracy: 1.0000
782/782 [==============================] - 5s 6ms/step - loss: 0.6214 - accuracy: 0.8732
Train: 1.000, Test: 0.873
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))
Accuracy: 87.32%
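The gap between training accuracy (100%) and test accuracy (about 87%) points to overfitting. One possible mitigation, sketched here under the same variable names but not part of the original tuned model, is to add dropout and monitor validation loss for early stopping:

from tensorflow.keras.layers import Dropout

regularized = Sequential([
    Embedding(most_frequent_words, embedding_vector_length, input_length=max_review_length),
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),   # drop half the activations at training time
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
regularized.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop on validation loss rather than training loss
val_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# regularized.fit(x_train, y_train, epochs=10, batch_size=128,
#                 validation_data=(x_test, y_test), callbacks=[val_callback])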
Q6. Use the designed model to print the prediction on any one sample
preds = model.predict(x_test)
782/782 [==============================] - 5s 6ms/step
preds[0]
array([2.9718738e-05], dtype=float32)
def predict_sentiment(text):
    # Prepare the input by removing punctuation characters, converting
    # characters to lower case, and removing words containing numbers
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.lower().split(' ')
    text = [word for word in text if word.isalpha()]

    # Generate an input tensor
    encoded = [1]
    for word in text:
        if word in vocab_index and vocab_index[word] < most_frequent_words:
            encoded.append(vocab_index[word])
        else:
            encoded.append(2)
    padded_input = pad_sequences([encoded], maxlen=926)

    # Invoke the model and return the result
    result = model.predict(padded_input)[0][0]
    return result
predict_sentiment('Undoubtedly the most stellar experience I have ever watched.')
1/1 [==============================] - 0s 25ms/step
0.9829639
predict_sentiment('I had a really bad experience with the customer executive.')
1/1 [==============================] - 0s 17ms/step
0.1035999
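predict_sentiment returns the model's probability of positive sentiment; a hypothetical wrapper (not in the original notebook) can map it to a label with a 0.5 threshold:

def predict_label(text, threshold=0.5):
    score = predict_sentiment(text)           # probability of positive sentiment
    label = 'positive' if score >= threshold else 'negative'
    return label, float(score)

print(predict_label('I had a really bad experience with the customer executive.'))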
PART B
Importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from zipfile import ZipFile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
Q1. Read and explore the data
sarcasm_df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
sarcasm_df.head(10)
| | is_sarcastic | headline | article_link |
|---|---|---|---|
| 0 | 1 | thirtysomething scientists unveil doomsday clo... | https://www.theonion.com/thirtysomething-scien... |
| 1 | 0 | dem rep. totally nails why congress is falling... | https://www.huffingtonpost.com/entry/donna-edw... |
| 2 | 0 | eat your veggies: 9 deliciously different recipes | https://www.huffingtonpost.com/entry/eat-your-... |
| 3 | 1 | inclement weather prevents liar from getting t... | https://local.theonion.com/inclement-weather-p... |
| 4 | 1 | mother comes pretty close to using word 'strea... | https://www.theonion.com/mother-comes-pretty-c... |
| 5 | 0 | my white inheritance | https://www.huffingtonpost.com/entry/my-white-... |
| 6 | 0 | 5 ways to file your taxes with less stress | https://www.huffingtonpost.com/entry/5-ways-to... |
| 7 | 1 | richard branson's global-warming donation near... | https://www.theonion.com/richard-bransons-glob... |
| 8 | 1 | shadow government getting too large to meet in... | https://politics.theonion.com/shadow-governmen... |
| 9 | 0 | lots of parents know this scenario | https://www.huffingtonpost.comhttp://pubx.co/6... |
sarcasm_df.shape
(28619, 3)
Looking at the statistical summary of the dataframe
sarcasm_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_sarcastic 28619 non-null int64
1 headline 28619 non-null object
2 article_link 28619 non-null object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB
sarcasm_df.describe()
| | is_sarcastic |
|---|---|
| count | 28619.000000 |
| mean | 0.476397 |
| std | 0.499451 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
sarcasm_df.is_sarcastic.value_counts()
is_sarcastic
0 14985
1 13634
Name: count, dtype: int64
The classes are almost perfectly balanced, so no rebalancing of the data is needed.
Let’s look at the first headline
sarcasm_df.loc[0]['headline']
'thirtysomething scientists unveil doomsday clock of hair loss'
Q2. Retain relevant columns
For the current project, the article_link column is not relevant. In a more detailed capstone project we could scrape the full articles and analyse them, but that is out of scope here.
sarcasm_relevant_df = sarcasm_df.drop('article_link', axis=1)
Q3. Get length of each sentence
length_each_sentence = [len(text) for text in sarcasm_relevant_df['headline']]
Let’s look at the maximum length from the headline column
max_length_sentence = max(length_each_sentence)
max_length_sentence
926
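Note that len(text) counts characters, so maxlen = 926 is the longest headline in characters, which is far larger than any headline's token count. If a token-level maximum were preferred, a sketch like the following could be used instead (an alternative, not what this notebook does):

# Token-level lengths: split each headline on whitespace and count the tokens
word_lengths = [len(text.split()) for text in sarcasm_relevant_df['headline']]
print("Max tokens per headline:", max(word_lengths))
print("Average tokens per headline:", round(np.mean(word_lengths), 2))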
Q4. Define parameters
max_features = 10000
maxlen = max_length_sentence
embedding_size = 200
output_dim = 200
Q5. Get indices for words
We’ll now tokenize the headlines and get an integer index for each word using Keras’s Tokenizer.
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(sarcasm_relevant_df['headline'])
tokenizer.word_index
{'to': 1,
 'of': 2,
 'the': 3,
 'in': 4,
 'for': 5,
 'a': 6,
 'on': 7,
 'and': 8,
 'with': 9,
 'is': 10,
 'new': 11,
 'trump': 12,
 'man': 13,
 'at': 14,
 'from': 15,
 ...}
(output truncated: the index continues through all 30,884 words, ordered by descending frequency)
Q6. Create features and labels
X = tokenizer.texts_to_sequences(sarcasm_relevant_df['headline'])
X = pad_sequences(X, maxlen=maxlen)
y = np.asarray(sarcasm_relevant_df['is_sarcastic'])

print("Number of Feature samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])
Number of Feature samples: 28619
[   0    0    0 ...  354 3166 7473 2643    2  660 1118]
(output truncated: the first 919 entries of the 926-length sequence are padding zeros)
Number of Labels: 28619
1
Q7. Get vocabulary size
vocabulary_size = len(tokenizer.word_index)
print(vocabulary_size)
30884
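Note that word_index always holds the full vocabulary; the num_words=10000 limit is only applied when texts are converted to sequences. A quick check (a sketch, using the X built in Q6):

print("Highest word index actually used in X:", X.max())   # bounded by num_words
print("Full vocabulary size:", len(tokenizer.word_index))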
Q8. Create a weight matrix using GloVe embeddings
= "glove.6B.zip"
glove_file
#Extract Glove embedding zip file
with ZipFile(glove_file, 'r') as z:
z.extractall()
= './glove.6B.200d.txt'
EMBEDDING_FILE
= {}
embeddings for o in open(EMBEDDING_FILE):
= o.split(" ")[0]
word # print(word)
= o.split(" ")[1:]
embd = np.asarray(embd, dtype='float32')
embd # print(embd)
= embd embeddings[word]
embedding_matrix = np.zeros((vocabulary_size, 200))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # word indices start at 1, so the vector for index i is stored in row i - 1
        embedding_matrix[i - 1] = embedding_vector
len(embeddings.values())
400000
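A quick coverage check (a sketch, using the embeddings dictionary and tokenizer from above) shows how much of the headline vocabulary actually receives a pre-trained GloVe vector:

covered = sum(1 for word in tokenizer.word_index if word in embeddings)
print("GloVe coverage: %d of %d words (%.1f%%)" % (covered, vocabulary_size, 100 * covered / vocabulary_size))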
Q9. Define and compile a Bidirectional LSTM model
### Embedding layer (hint)
## model.add(Embedding(num_words, embedding_size, weights=[embedding_matrix]))
### Bidirectional LSTM layer (hint)
## model.add(Bidirectional(LSTM(128, return_sequences=True)))
input_layer = Input(shape=(maxlen,), dtype=tf.int64)
embed = Embedding(embedding_matrix.shape[0], output_dim=200, weights=[embedding_matrix],
                  input_length=maxlen, trainable=True)(input_layer)
lstm = Bidirectional(LSTM(128))(embed)
drop = Dropout(0.3)(lstm)
dense = Dense(100, activation='relu')(drop)
out = Dense(2, activation='softmax')(dense)
Q10. Fit the model and check the validation accuracy
batch_size = 100
epochs = 5

model = Model(input_layer, out)
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 926)] 0
embedding (Embedding) (None, 926, 200) 6176800
bidirectional (Bidirectional) (None, 256) 336896
dropout_8 (Dropout) (None, 256) 0
dense_16 (Dense) (None, 100) 25700
dense_17 (Dense) (None, 2) 202
=================================================================
Total params: 6,539,598
Trainable params: 6,539,598
Non-trainable params: 0
_________________________________________________________________
tf.config.run_functions_eagerly(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
with tf.device('/device:GPU:0'):
    res = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
Epoch 1/5
229/229 [==============================] - 173s 745ms/step - loss: 0.5122 - accuracy: 0.7243
Epoch 2/5
229/229 [==============================] - 235s 1s/step - loss: 0.2859 - accuracy: 0.8794
Epoch 3/5
229/229 [==============================] - 167s 729ms/step - loss: 0.1933 - accuracy: 0.9240
Epoch 4/5
229/229 [==============================] - 165s 719ms/step - loss: 0.1287 - accuracy: 0.9537
Epoch 5/5
229/229 [==============================] - 162s 706ms/step - loss: 0.0852 - accuracy: 0.9716
/Users/kashmkj/micromamba/envs/tensorflow_gl/lib/python3.10/site-packages/tensorflow/python/data/ops/structured_function.py:254: UserWarning: Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.
warnings.warn(
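The fit call above does not receive any validation data, so no per-epoch validation accuracy is reported. A sketch of how it could be checked, either during training or on the held-out split afterwards:

# Option 1: report validation metrics every epoch
# res = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
#                 validation_data=(X_test, y_test), verbose=1)

# Option 2: evaluate the held-out split after training
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print("Validation accuracy: %.3f" % val_acc)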
predictions = model.predict(np.array(X_test), verbose=1)
179/179 [==============================] - 12s 60ms/step
test_pred = [0 if i > j else 1 for i, j in predictions]
print(classification_report(y_test, test_pred))
precision recall f1-score support
0 0.85 0.88 0.86 2977
1 0.86 0.83 0.84 2747
accuracy 0.85 5724
macro avg 0.85 0.85 0.85 5724
weighted avg 0.85 0.85 0.85 5724
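confusion_matrix was imported earlier but never used; a short sketch to complement the classification report:

cm = confusion_matrix(y_test, test_pred)
print(cm)   # rows: true class (0 = not sarcastic, 1 = sarcastic), columns: predicted class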
The BiLSTM reaches about 97% accuracy on the training set and roughly 85% accuracy on the held-out test set. We could fine-tune the model further by training for more epochs or adding more hidden layers with dropout. The project ends here.