I once found the script of "Game of Thrones" on Kaggle and decided to try NLP techniques for data analysis.
NLP is one of the important areas of machine learning. I won't go into the details here; there are some articles on Medium about it.
#import libraries
import numpy as np
import pandas as pd
import plotly.express as px ## Visualization
import plotly.graph_objects as go ## Visualization
import matplotlib.pyplot as plt ## Visualization
import plotly as py ## Visualization
from wordcloud import WordCloud, STOPWORDS ## To create word clouds from script
import os
%config IPCompleter.greedy=True
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import gensim
from gensim.models import word2vec
from sklearn.manifold import TSNE
# WORD2VEC
# Hyperparameters for the gensim Word2Vec model constructed later in this script.
W2V_SIZE = 300  # passed as `size` to Word2Vec
W2V_WINDOW = 7  # passed as `window` to Word2Vec
W2V_EPOCH = 32  # epoch count; presumably meant for Word2Vec training -- TODO confirm it is actually used
W2V_MIN_COUNT = 10  # passed as `min_count` to Word2Vec
Read dataset
# Load the script CSV (path is relative to the current working directory).
df = pd.read_csv('Game_of_Thrones_Script.csv')
# Preview the first rows (rendered only when this is the last expression of a notebook cell).
df.head()
Some data preprocessing and cleaning
Change date format
# Parse the release dates and derive the 'Year' and 'Month' columns.
# Plain column assignment is used instead of `df.loc[:, col] = ...`: since
# pandas 2.0 the `.loc` form writes into the existing column in place and keeps
# its old dtype, so 'Release Date' would stay `object` and the `.dt` accessor
# below would raise.
df['Release Date'] = pd.to_datetime(df['Release Date'])
df['Year'] = df['Release Date'].dt.year
# Month number (1-12) -> three-letter English abbreviation.
month_mapper = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
# Map straight from the month numbers so an integer column is never overwritten
# with strings (which triggers an incompatible-dtype warning on recent pandas).
df['Month'] = df['Release Date'].dt.month.map(month_mapper)
Stop words are commonly used words such as “the”, “a”, “an” and “in”, which carry little meaning on their own. A stemming algorithm reduces inflected forms such as “chocolates” and “chocolatey” to a common stem.
# English stop words to filter out of the dialogue. Stored as a set because the
# cleaning step tests membership once per token; with a list every test would
# be a linear scan over the whole stop-word list.
stop_words = set(stopwords.words("english"))
# Snowball stemmer for English; applied only when stemming is requested.
stemmer = SnowballStemmer("english")
Text cleaning
# Matches @mentions, http(s) links and any run of non-alphanumeric characters.
# Written as a raw string: "\S" inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12). The
# pattern itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(text, stem=False):
    """Lower-case *text*, strip noise and drop English stop words.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and NaN (missing cells) are treated as empty text.
        stem: When true, every kept token is reduced with the module-level
            ``stemmer``.

    Returns:
        The remaining tokens joined by single spaces (``""`` if none remain).
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the word statistics.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Remove link,user and special characters
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)
Apply function
# Clean every line of dialogue with preprocess() using its default arguments.
df['Sentence'] = df['Sentence'].apply(preprocess)
There are some incidental, unnamed characters in the script; I will drop them from the dataset.
# Generic, unnamed speakers to exclude from the analysis.
characters_drop = ['man', 'women', 'boy', 'girl', 'old man']
# `~` is the logical-NOT operator for boolean masks. Unary minus on booleans is
# rejected by NumPy and only works on a pandas Series through a special case.
df = df[~df['Name'].isin(characters_drop)]
Total dialogues by seasons
# Count dialogue lines per season and order the rows by season label.
temp = (
    df['Season']
    .value_counts()
    .rename_axis('Season')
    .reset_index(name='Counts')
    .sort_values(by='Season')
)
fig = px.bar(temp, 'Season', 'Counts')
# Fixed-size figure with a centred title.
season_title = dict(
    text="Total dialougue counts in season.",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=season_title,
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()
Seasons 2, 3 and 4 lead here. Keep in mind that season 8 had long episodes with extended stretches without any talking.
Let's look at the breakdown by episode.
plt.rcParams["figure.figsize"] = (15, 10)
# No earlier statement in this script creates 'count_words', so derive it when
# it is missing. NOTE(review): this counts tokens of the already cleaned
# 'Sentence' text (stop words removed) -- confirm that is the intended measure.
if 'count_words' not in df.columns:
    df = df.assign(count_words=df['Sentence'].str.split().str.len())
# One bar per season, stacked by episode. The former `fill='count_words'`
# keyword was dropped: it is not a pandas plotting option and was only
# forwarded to matplotlib, where any non-empty string just means "filled".
words_per_episode = df.groupby(['Season', 'Episode'])['count_words'].sum().unstack()
temp = words_per_episode.plot(kind='bar', stacked=True)  # matplotlib Axes
plt.title("Number of words by Episodes in all Seasons", fontsize=20)
# savefig() does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/Episode_words.jpg')
plt.show()
Most frequent 20 words
from collections import Counter

# Tally every token across all dialogue lines and keep the 20 most common.
all_tokens = " ".join(df["Sentence"]).split()
words = Counter(all_tokens).most_common(20)
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels={'x': 'words', 'y': 'count'})
# Fixed-size figure with a centred title.
top_words_title = dict(
    text="Most frequent 20 words",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=top_words_title,
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()
A word cloud (or tag cloud) is a visual representation of text data. It displays a set of words, with the importance of each being shown by its font size or color.
# Build a word cloud from all cleaned dialogue lines joined into one string.
wordcloud = WordCloud(width=1000, height=600, min_font_size=10,
                      background_color='#add8e6').generate(' '.join(df['Sentence']))
plt.figure(figsize=(12, 12), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
# savefig() does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/Most_common_words_GOT.jpg')
# The call was previously left unclosed (`plt.show(`), which is a syntax error.
plt.show()
Most frequent 20 words used by tyrion_lannister
# Select Tyrion Lannister's lines, then chart his 20 most used words.
is_tyrion = df['Name'] == 'tyrion lannister'
tyrion_lannister = df[is_tyrion]
tyrion_tokens = " ".join(tyrion_lannister["Sentence"]).split()
words = Counter(tyrion_tokens).most_common(20)
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels={'x': 'words', 'y': 'count'})
# Fixed-size figure with a centred title.
tyrion_title = dict(
    text="Most common 20 words of Tyrion Lannister",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=tyrion_title,
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()
Most frequent 20 words used by daenerys_targaryen
# Select Daenerys Targaryen's lines, then chart her 20 most used words.
is_daenerys = df['Name'] == 'daenerys targaryen'
daenerys_targaryen = df[is_daenerys]
daenerys_tokens = " ".join(daenerys_targaryen["Sentence"]).split()
words = Counter(daenerys_tokens).most_common(20)
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels={'x': 'words', 'y': 'count'})
# Fixed-size figure with a centred title.
daenerys_title = dict(
    text="Most common 20 words of daenerys_targaryen",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=daenerys_title,
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()
20 characters with most dialogues
# Number of dialogue lines per character, most talkative first.
temp = (
    df['Name']
    .value_counts()
    .rename_axis('Character')
    .reset_index(name='No of Dialouges')
)
# Only the 20 most frequent speakers are charted.
fig = px.bar(temp.head(20), 'Character', 'No of Dialouges')
characters_title = dict(
    text="Characters with most dialogues",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=characters_title,
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()
I want to find the most important words for each of the main characters using tf-idf. Simply, these are the words that can help explain the meaning of sentences. You can read more about tf-idf here
Daenerys Targaryen
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit tf-idf on Daenerys' lines; each line of dialogue is one document.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(daenerys_targaryen.Sentence)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_array = np.asarray(tfidf.get_feature_names_out())
else:
    feature_array = np.array(tfidf.get_feature_names())
# Rank terms by their tf-idf weight summed over ALL lines. The previous
# `np.argsort(x.toarray()).flatten()[::-1]` sorted each row separately, so
# after flattening and reversing, the head of the array held only the term
# ranking of the last line of dialogue.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]
n = 20
top_n = feature_array[tfidf_sorting][:n]
text = ' '.join(top_n)
# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)
# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
# savefig() does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_daenerys.jpg')
plt.show()
Tyrion Lannister
# Fit tf-idf on Tyrion's lines; each line of dialogue is one document.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(tyrion_lannister.Sentence)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_array = np.asarray(tfidf.get_feature_names_out())
else:
    feature_array = np.array(tfidf.get_feature_names())
# Rank terms by their tf-idf weight summed over ALL lines. The previous
# `np.argsort(x.toarray()).flatten()[::-1]` sorted each row separately, so
# after flattening and reversing, the head of the array held only the term
# ranking of the last line of dialogue.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]
n = 20
top_n = feature_array[tfidf_sorting][:n]
text = ' '.join(top_n)
# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)
# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Same zero-padding layout as the other per-character word clouds in this script.
plt.tight_layout(pad=0)
# savefig() does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_tyrion.jpg')
plt.show()
Jon Snow
# Fit tf-idf on Jon Snow's lines; each line of dialogue is one document.
jon_snow = df[df['Name'] == 'jon snow']
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(jon_snow.Sentence)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_array = np.asarray(tfidf.get_feature_names_out())
else:
    feature_array = np.array(tfidf.get_feature_names())
# Rank terms by their tf-idf weight summed over ALL lines. The previous
# `np.argsort(x.toarray()).flatten()[::-1]` sorted each row separately, so
# after flattening and reversing, the head of the array held only the term
# ranking of the last line of dialogue.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]
n = 30
top_n = feature_array[tfidf_sorting][:n]
text = ' '.join(top_n)
# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)
# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
# savefig() does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_jon.jpg')
plt.show()
Build a corpus for the word2vec model
def build_corpus(data):
    """Split each sentence into its words.

    Args:
        data: Iterable of strings, one sentence per item.

    Returns:
        A list with one list of words per sentence, in input order. An empty
        or whitespace-only sentence yields an empty list.
    """
    # split() without an argument collapses runs of whitespace, so empty or
    # multi-space sentences no longer add '' as a bogus word to the corpus
    # (split(" ") did).
    return [sentence.split() for sentence in data]
# Tokenised sentences spoken by Daenerys Targaryen, used as the word2vec corpus.
corpus = build_corpus(daenerys_targaryen['Sentence'])
t-SNE plot of Daenerys Targaryen's words. In this context, t-SNE shows which words are used together.
# Word2Vec model over the tokenised corpus.
# NOTE(review): `size=` is the gensim 3.x spelling (renamed `vector_size` in
# gensim 4); it is kept because the rest of this script uses the 3.x API.
model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE,
                                        window=W2V_WINDOW,
                                        min_count=W2V_MIN_COUNT,
                                        workers=4)
model.build_vocab(corpus)
# build_vocab() only registers the vocabulary and initialises random vectors.
# Without this training pass the embeddings plotted afterwards carry no
# information about word usage, and W2V_EPOCH would never be used.
model.train(corpus, total_examples=model.corpus_count, epochs=W2V_EPOCH)
# import the t-SNE library and matplotlib for plotting
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# define the function to compute the dimensionality reduction
# and then produce the biplot
def tsne_plot(model, output_path='images/tsne_daenerys.jpg'):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Args:
        model: Trained gensim Word2Vec model (gensim 3.x API: the vocabulary
            is read from ``model.wv.vocab``).
        output_path: File the figure is saved to. Defaults to the path that
            was previously hard-coded, so existing calls behave the same.
    """
    labels = list(model.wv.vocab)
    # Read vectors through `model.wv`; indexing the model object directly is
    # deprecated in gensim 3.x and removed in gensim 4.
    tokens = np.asarray([model.wv[word] for word in labels])

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    # One scatter call per word, so each point takes the next colour of the
    # matplotlib colour cycle.
    for label, (x_coord, y_coord) in zip(labels, new_values):
        plt.scatter(x_coord, y_coord)
        plt.annotate(label,
                     xy=(x_coord, y_coord),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    # savefig() does not create missing directories.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(output_path)
    plt.show()
# call the function on our dataset
# (`model` is the Word2Vec model built from Daenerys Targaryen's sentences)
tsne_plot(model)
And the last thing I will add is a bar chart race which I made using Flourish Studio. Enjoy!
Comments