Visual Analytics project to analyze and discovery insights of video game sales in recent years, with the high-level API plotly.express.
# Load the Pandas libraries
import pandas as pd
# Load Plot libraries
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
The first step is to load the dataset into a pandas dataframe. Here you can see the dataset.
dataURL = "../data/vgsales_2020.csv"
raw_data = pd.read_csv(dataURL)
raw_data
Rank | Name | Platform | Year | Genre | Publisher | Developer | Global_Sales | |
---|---|---|---|---|---|---|---|---|
0 | 1 | Wii Sports | Wii | 2006 | Sports | Nintendo | NaN | 82.74 |
1 | 2 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | NaN | 40.24 |
2 | 3 | Mario Kart Wii | Wii | 2008 | Racing | Nintendo | NaN | 35.82 |
3 | 4 | Wii Sports Resort | Wii | 2009 | Sports | Nintendo | NaN | 33.00 |
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996 | Role-Playing | Nintendo | NaN | 31.37 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
17806 | 17807 | This is the Police | NS | 2017 | Simulation | THQ Nordic | Weappy Studio | 0.00 |
17807 | 17808 | Starry Sky: Summer Stories | PSV | 2017 | Adventure | honeybee | honeybee | 0.00 |
17808 | 17809 | Kingdom Hearts HD I.5 + II.5 ReMIX | PS4 | 2017 | Action | Square Enix | Square Enix | 0.00 |
17809 | 17810 | DOOM VFR | PC | 2017 | Shooter | Bethesda Softworks | id Software | 0.00 |
17810 | 17811 | Marvel vs. Capcom: Infinite | PC | 2017 | Fighting | Capcom | Capcom | 0.00 |
17811 rows × 8 columns
Now the basic statistics of the numeric fields are shown, to have a quick understanding of the behavior of the data.
raw_data.describe()
Rank | Year | Global_Sales | |
---|---|---|---|
count | 17811.000000 | 17811.000000 | 17811.000000 |
mean | 8906.000000 | 2007.319409 | 0.518108 |
std | 5141.737158 | 6.370235 | 1.522937 |
min | 1.000000 | 1980.000000 | 0.000000 |
25% | 4453.500000 | 2004.000000 | 0.050000 |
50% | 8906.000000 | 2008.000000 | 0.160000 |
75% | 13358.500000 | 2011.000000 | 0.450000 |
max | 17811.000000 | 2020.000000 | 82.740000 |
Important Note: Sales will be grouped by release year of game, not by real sale date as we don't have such historical data.
# Total sales
gd_sales = raw_data.groupby(["Year"]).sum()
gd_sales.reset_index(inplace=True)
# Plot global trend
fig = px.line(gd_sales, x="Year", y="Global_Sales")
fig.add_shape(dict(type="line", x0=2008, y0=0, x1=2008, y1=700, line=dict(color="RoyalBlue", width=2, dash="dot")))
fig.update_layout(height=400)
fig.update_xaxes(title_text="Release Year")
fig.update_yaxes(title_text="# Global Sales")
fig.show()
Now we can plot the top 50 best-selling video games in the world.
# Data
top_games = 50
raw_data.head(10)
Rank | Name | Platform | Year | Genre | Publisher | Developer | Global_Sales | |
---|---|---|---|---|---|---|---|---|
0 | 1 | Wii Sports | Wii | 2006 | Sports | Nintendo | NaN | 82.74 |
1 | 2 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | NaN | 40.24 |
2 | 3 | Mario Kart Wii | Wii | 2008 | Racing | Nintendo | NaN | 35.82 |
3 | 4 | Wii Sports Resort | Wii | 2009 | Sports | Nintendo | NaN | 33.00 |
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996 | Role-Playing | Nintendo | NaN | 31.37 |
5 | 6 | Tetris | GB | 1989 | Puzzle | Nintendo | NaN | 30.26 |
6 | 7 | New Super Mario Bros. | DS | 2006 | Platform | Nintendo | NaN | 30.01 |
7 | 8 | Wii Play | Wii | 2006 | Misc | Nintendo | NaN | 29.02 |
8 | 9 | New Super Mario Bros. Wii | Wii | 2009 | Platform | Nintendo | NaN | 28.62 |
9 | 10 | Duck Hunt | NES | 1984 | Shooter | Nintendo | NaN | 28.31 |
# Plot the best-selling video games, colored by Publisher
fig = px.bar(raw_data.head(top_games), x = 'Global_Sales', y = 'Name', color='Publisher',
orientation='h', hover_data=["Platform"])
fig.update_layout(yaxis={'categoryorder':'total ascending'}, showlegend=True)
fig.update_layout(height=800, title_text="Top 50 Best-Selling Video Games")
fig.update_xaxes(title_text="# Global Sales")
fig.update_yaxes(title_text="")
fig.show()
# Grouped data
gd = raw_data.groupby(['Platform', 'Publisher']).agg({'Global_Sales':'sum'})
gd.reset_index(inplace=True)
gd
Platform | Publisher | Global_Sales | |
---|---|---|---|
0 | 2600 | 20th Century Fox Video Games | 1.94 |
1 | 2600 | Activision | 18.38 |
2 | 2600 | Answer Software | 0.50 |
3 | 2600 | Atari | 41.30 |
4 | 2600 | Avalon Interactive | 0.17 |
... | ... | ... | ... |
2072 | XOne | Ubisoft | 27.58 |
2073 | XOne | Unknown | 0.03 |
2074 | XOne | Villa Gorilla | 0.02 |
2075 | XOne | Warner Bros. Interactive Entertainment | 14.67 |
2076 | XOne | Wired Productions | 0.02 |
2077 rows × 3 columns
# Plot Video Game Sales grouped by Platform
fig = px.treemap(gd, path=['Platform', 'Publisher'], values='Global_Sales')
fig.show()
# Plot Video Game Sales grouped by Publisher
fig = px.treemap(gd, path=['Publisher', 'Platform'], values='Global_Sales')
fig.show()
Important Note: Sales will be grouped by release year of game, not by real sale date as we don't have such historical data.
top_companies = 10
# Top 10 Companies
gd = raw_data.groupby(['Publisher']).agg({'Global_Sales':'sum'})
gd = gd.sort_values(by='Global_Sales', ascending=False)
top_companies = list(gd.head(top_companies).index)
top_companies
['Nintendo', 'Electronic Arts', 'Activision', 'Sony Computer Entertainment', 'Ubisoft', 'Take-Two Interactive', 'THQ', 'Konami Digital Entertainment', 'Sega', 'Namco Bandai Games']
# Grouped data
gd = raw_data[raw_data["Publisher"].isin(top_companies)].groupby(['Year', 'Publisher']).agg({'Global_Sales':'sum'})
gd = gd.sort_values(by='Year', ascending=True)
gd.reset_index(inplace=True)
gd
Year | Publisher | Global_Sales | |
---|---|---|---|
0 | 1980 | Activision | 3.02 |
1 | 1981 | Activision | 8.50 |
2 | 1982 | Activision | 1.86 |
3 | 1982 | Sega | 0.40 |
4 | 1983 | Activision | 1.94 |
... | ... | ... | ... |
281 | 2020 | Sony Computer Entertainment | 0.26 |
282 | 2020 | THQ | 0.01 |
283 | 2020 | Activision | 0.01 |
284 | 2020 | Namco Bandai Games | 0.01 |
285 | 2020 | Ubisoft | 0.29 |
286 rows × 3 columns
Plotting Sales Trends of Top 10 Publishers
fig = px.line(gd, x="Year", y="Global_Sales", color='Publisher')
fig.update_layout(title_text="Sales Trends of Top 10 Publishers")
fig.update_xaxes(title_text="Release Year")
fig.update_yaxes(title_text="# Global Sales")
fig.show()
This multi-line chart confirms the insights obtained in point 4.
Regarding Platform and Genre from 2013.
# Parallel Categories Diagram
fig = px.parallel_categories(raw_data.query("Year >= 2013"), dimensions=["Platform", "Genre"])
fig.show()
# Top 10 Companies
gd = raw_data.groupby(['Year', 'Genre']).agg({'Global_Sales':'sum'})
gd.reset_index(inplace=True)
gd
Year | Genre | Global_Sales | |
---|---|---|---|
0 | 1980 | Action | 0.34 |
1 | 1980 | Fighting | 0.77 |
2 | 1980 | Misc | 2.71 |
3 | 1980 | Shooter | 7.07 |
4 | 1980 | Sports | 0.49 |
... | ... | ... | ... |
434 | 2020 | Racing | 0.11 |
435 | 2020 | Role-Playing | 1.64 |
436 | 2020 | Simulation | 0.29 |
437 | 2020 | Sports | 0.03 |
438 | 2020 | Strategy | 0.15 |
439 rows × 3 columns
# Parallel Categories Diagram
fig = px.area(raw_data, x='Year', y='Global_Sales', color='Genre')
fig.update_layout(title_text="Evolution of Video Game Sales by Genre")
fig.update_xaxes(title_text="Release Year")
fig.update_yaxes(title_text="# Global Sales")
fig.show()
# Cook the data
gd_1980s = raw_data.query("Year>=1980 and Year<1990")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_1980s.reset_index(inplace=True)
gd_1990s = raw_data.query("Year>=1900 and Year<2000")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_1990s.reset_index(inplace=True)
gd_2000s = raw_data.query("Year>=2000 and Year<2010")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_2000s.reset_index(inplace=True)
gd_2010s = raw_data.query("Year>=2010")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_2010s.reset_index(inplace=True)
# Create subplots: use 'domain' type for Pie subplot
labels = ["US", "China", "European Union", "Russian Federation", "Brazil", "India", "Rest of World"]
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}], [{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=gd_1980s["Genre"], values=gd_1980s["Global_Sales"], name="1980s", title="1980s", hole=.3), 1, 1)
fig.add_trace(go.Pie(labels=gd_1990s["Genre"], values=gd_1990s["Global_Sales"], name="1990s", title="1990s", hole=.3), 1, 2)
fig.add_trace(go.Pie(labels=gd_2000s["Genre"], values=gd_2000s["Global_Sales"], name="2000s", title="2000s", hole=.3), 2, 1)
fig.add_trace(go.Pie(labels=gd_2010s["Genre"], values=gd_2010s["Global_Sales"], name="2010s", title="2010s", hole=.3), 2, 2)
fig.update_layout(height=800, title_text="Video Game Sales by Genre by Decades")
fig.show()
Finally, how the video game genre trend has changed in each decade: