top of page

NLP - GOT

I came across the script of "Game of Thrones" on Kaggle and decided to try some NLP techniques for data analysis on it.

NLP is one of the important areas of machine learning. I don't want to go into the details here; here are some articles about it on Medium.

#import libraries
import numpy as np 
import pandas as pd
import plotly.express as px ## Visualization
import plotly.graph_objects as go ## Visualization
import matplotlib.pyplot as plt ## Visualization
import plotly as py ## Visualization
from wordcloud import WordCloud, STOPWORDS ## To create word clouds from script
import os
%config IPCompleter.greedy=True
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import re
import gensim
from gensim.models import word2vec
from sklearn.manifold import TSNE

# Word2Vec settings. W2V_SIZE, W2V_WINDOW and W2V_MIN_COUNT are passed to the
# gensim Word2Vec constructor later in this notebook as size, window and
# min_count respectively.
W2V_SIZE = 300
W2V_WINDOW = 7
# NOTE(review): presumably the number of training epochs — confirm where it is consumed.
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

Read dataset

# Load the script CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Game_of_Thrones_Script.csv')
df.head(5)

Some data preprocessing and cleaning


Change date format

# Parse the release date into a real datetime column.
# Plain column assignment is used instead of ``df.loc[:, col] = ...``: on
# pandas 2.x the ``.loc`` form writes into the existing column in place, so an
# object-dtype column stays object-dtype and the ``.dt`` accessor below fails.
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Calendar features derived from the parsed date.
df['Year'] = df['Release Date'].dt.year
month_mapper = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
# Store the month as its three-letter abbreviation; mapping straight from the
# month number avoids first creating an integer column and then overwriting it
# with strings.
df['Month'] = df['Release Date'].dt.month.map(month_mapper)

Stop words are commonly used words such as “the”, “a”, “an” and “in”. A stemming algorithm reduces related word forms such as “chocolates” and “choco” to the root word “chocolate”.

# English stop-word list from NLTK; preprocess() drops every token found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
stop_words = stopwords.words("english") 
# English Snowball stemmer; preprocess() only uses it when called with stem=True.
stemmer = SnowballStemmer("english")

Text cleaning

# Matches @mentions, http(s) links and any run of non-alphanumeric characters.
# Written as a raw string: "\S" in a normal string literal is an invalid
# escape sequence, which newer Python versions warn about.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(text, stem=False):
    """Lower-case *text*, strip noise and stop words, and optionally stem.

    Args:
        text: The raw sentence. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty sentence so that
            missing values do not become the tokens "none" / "nan".
        stem: When true, each kept token is passed through the module-level
            ``stemmer``.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Replace mentions, links and special characters with a space.
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = [token for token in cleaned.split() if token not in stop_words]
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens)

Apply function

# Clean every line of dialogue in place (default arguments, i.e. no stemming).
df['Sentence'] = df['Sentence'].apply(preprocess)

The script also contains occasional generic, unnamed characters; I will drop them from the dataset.

# Generic speaker labels to exclude from the analysis.
characters_drop = ['man', 'women', 'boy', 'girl', 'old man']
# Keep only rows whose speaker is not in the list. ``~`` is the boolean
# inversion operator for masks; unary ``-`` is arithmetic negation and is not
# defined for plain NumPy boolean arrays.
df = df[~df['Name'].isin(characters_drop)]

Total dialogues by seasons

# Number of script lines per season, ordered by season label.
temp = df['Season'].value_counts().reset_index()
temp.columns = ['Season', 'Counts']
temp = temp.sort_values(by='Season')

# Bar chart with a centred title and fixed canvas size.
fig = px.bar(temp, x='Season', y='Counts')
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=dict(
        text="Total dialougue counts in season.",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

Seasons 2, 3 and 4 lead here. Keep in mind that season 8 had long episodes without any talking.


Let's see the episodes breakdown.

plt.rcParams["figure.figsize"] = (15,10)

# Words per line of dialogue. No visible cell creates a 'count_words' column,
# so fall back to counting the whitespace-separated tokens of 'Sentence'
# (i.e. of the sentence as it is stored at this point in the notebook).
if 'count_words' in df.columns:
    word_counts = df['count_words']
else:
    word_counts = df['Sentence'].str.split().str.len()

# One bar per season, stacked by episode. The former ``fill='count_words'``
# argument was dropped: it is not a pandas plotting option.
temp = (
    word_counts
    .groupby([df['Season'], df['Episode']])
    .sum()
    .unstack()
    .plot(kind='bar', stacked=True)
)
plt.title("Number of words by Episodes in all Seasons", fontsize=20)

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/Episode_words.jpg')
plt.show()

Most frequent 20 words

from collections import Counter

# Tally every whitespace-separated token across all sentences, then keep the
# 20 most frequent ones.
token_counts = Counter()
for sentence in df["Sentence"]:
    token_counts.update(sentence.split())
words = token_counts.most_common(20)

# Split the (word, count) pairs into parallel sequences for plotting.
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels=dict(x='words', y='count'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=dict(
        text="Most frequent 20 words",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

A word cloud (or tag cloud) is a visual representation of text data. It displays a set of words, with the importance of each one shown by its font size or color.

# Word cloud over every cleaned sentence in the script.
wordcloud = WordCloud(
    width=1000, height=600, min_font_size=10, background_color='#add8e6'
).generate(' '.join(df['Sentence']))

plt.figure(figsize=(12, 12), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/Most_common_words_GOT.jpg')
# The original call was missing its closing parenthesis (SyntaxError).
plt.show()

Most frequent 20 words used by tyrion_lannister

# Rows spoken by Tyrion Lannister.
tyrion_lannister = df.loc[df['Name'].eq('tyrion lannister')]

# Tally his tokens and keep the 20 most frequent.
token_counts = Counter()
for sentence in tyrion_lannister["Sentence"]:
    token_counts.update(sentence.split())
words = token_counts.most_common(20)

# Split the (word, count) pairs into parallel sequences for plotting.
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels=dict(x='words', y='count'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=dict(
        text="Most common 20 words of Tyrion Lannister",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

Most frequent 20 words used by daenerys_targaryen

# Rows spoken by Daenerys Targaryen.
daenerys_targaryen = df.loc[df['Name'].eq('daenerys targaryen')]

# Tally her tokens and keep the 20 most frequent.
token_counts = Counter()
for sentence in daenerys_targaryen["Sentence"]:
    token_counts.update(sentence.split())
words = token_counts.most_common(20)

# Split the (word, count) pairs into parallel sequences for plotting.
names, values = zip(*words)
fig = px.bar(x=names, y=values, labels=dict(x='words', y='count'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=dict(
        text="Most common 20 words of daenerys_targaryen",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

20 characters with most dialogues

# Number of script lines per character, most talkative first.
temp = df['Name'].value_counts().reset_index()
temp.columns = ['Character', 'No of Dialouges']

# Plot only the first 20 rows of the ranking.
fig = px.bar(temp.iloc[:20], x='Character', y='No of Dialouges')
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title=dict(
        text="Characters with most dialogues",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

Next, I want to find the most important words for each of the main characters using TF-IDF. Simply put, these are the words that can help explain the meaning of the sentences. You can read more about TF-IDF here.


Daenerys Targaryen

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on Daenerys' lines: one row per line, one column per term.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(daenerys_targaryen.Sentence)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_array = np.array(get_names())

# Rank terms by their total TF-IDF weight over all lines. The previous
# ``np.argsort(x.toarray()).flatten()[::-1]`` sorted each row separately, so
# after flattening and reversing, the leading indices were the ranking of the
# last line only rather than of the whole corpus.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]

n = 20
top_n = feature_array[tfidf_sorting][:n]

text = ' '.join(top_n)

# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)

# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_daenerys.jpg')
plt.show()

Tyrion Lannister

# Fit TF-IDF on Tyrion's lines: one row per line, one column per term.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(tyrion_lannister.Sentence)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_array = np.array(get_names())

# Rank terms by their total TF-IDF weight over all lines. Sorting the 2-D
# matrix row by row and flattening it ranked only the last line's terms.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]

n = 20
top_n = feature_array[tfidf_sorting][:n]
text = ' '.join(top_n)

# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)

# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Same padding as the other per-character word clouds in this notebook.
plt.tight_layout(pad=0)
# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_tyrion.jpg')
plt.show()



Jon Snow

# Rows spoken by Jon Snow.
jon_snow = df[df['Name'] == 'jon snow']

# Fit TF-IDF on his lines: one row per line, one column per term.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(jon_snow.Sentence)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_array = np.array(get_names())

# Rank terms by their total TF-IDF weight over all lines. Sorting the 2-D
# matrix row by row and flattening it ranked only the last line's terms.
term_scores = np.asarray(x.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]

n = 30
top_n = feature_array[tfidf_sorting][:n]
text = ' '.join(top_n)

# Create a cloud image:
wordcloud = WordCloud(width=1600, height=800, min_font_size=10,
                      background_color='#add8e6').generate(text)

# Display the generated image:
plt.figure(figsize=(12, 6), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
plt.savefig('images/important_words_jon.jpg')
plt.show()

Build a corpus for the word2vec model

def build_corpus(data):
    """Return one list of word tokens per sentence in *data*.

    Args:
        data: An iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, each a list of its
        whitespace-separated tokens. An empty or all-whitespace sentence
        yields an empty list.
    """
    # split() without an argument collapses repeated whitespace and returns
    # [] for an empty sentence; split(" ") returned [''] there, which put an
    # empty-string "word" into the corpus.
    return [sentence.split() for sentence in data]
# Tokenised corpus built from Daenerys Targaryen's sentences only; it is the
# input to the Word2Vec vocabulary built further down.
corpus = build_corpus(daenerys_targaryen.Sentence)

t-SNE of Daenerys Targaryen's words. In this context, the t-SNE plot shows which words are used together.

# Word2Vec model over the tokenised corpus.
w2v_params = dict(window=W2V_WINDOW, min_count=W2V_MIN_COUNT, workers=4)
try:
    # gensim >= 4 names the embedding dimensionality ``vector_size``.
    model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, **w2v_params)
except TypeError:
    # gensim 3.x only accepts the older ``size`` keyword.
    model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, **w2v_params)
model.build_vocab(corpus)

# build_vocab() only registers the words; the model was never trained, so the
# vectors plotted afterwards were the untrained initial ones. Run the actual
# training pass using the epoch count configured at the top of the notebook.
model.train(corpus, total_examples=model.corpus_count, epochs=W2V_EPOCH)

# import the t-SNE library and matplotlib for plotting
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# define the function to compute the dimensionality reduction
# and then produce the biplot
def tsne_plot(model, out_path='images/tsne_daenerys.jpg'):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Args:
        model: A Word2Vec model whose ``wv`` attribute holds the word vectors.
        out_path: File the figure is saved to. Defaults to the path that was
            previously hard-coded, so existing calls behave as before.

    Each vocabulary word is drawn as one labelled point. The figure is saved
    to *out_path* and then shown.
    """
    keyed_vectors = model.wv
    # gensim >= 4 exposes the vocabulary as ``key_to_index``; gensim 3.x uses
    # ``vocab``. Both are mappings keyed by word.
    vocab = getattr(keyed_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = keyed_vectors.vocab
    labels = list(vocab)
    # Index through ``model.wv``: ``model[word]`` is deprecated in gensim 3
    # and removed in gensim 4.
    tokens = np.array([keyed_vectors[word] for word in labels])

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    # One scatter call per word, as before, so every point keeps its own
    # colour from the colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    # savefig does not create missing directories.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path)
    plt.show()
    
# Plot the vocabulary of the Word2Vec model built above; tsne_plot shows the
# figure and also writes it to images/tsne_daenerys.jpg.
tsne_plot(model)

And the last thing I will add is a bar chart race which I made using Flourish Studio. Enjoy!


Comments


bottom of page