Analyzing a world population dataset in python

In this article, we look at a world population dataset, analyze it, and show a couple of Python examples that you can use with it.

You can download the dataset from the link at the bottom of the article. Here are the columns:

Rank: Rank by Population
CCA3: 3 Digit Country/Territory Code
Country/Territory: Name of the Country/Territory
Capital: Name of the Capital
Continent: Name of the Continent
2022 Population: Population of the Country/Territories in the year 2022
2020 Population: Population of the Country/Territories in the year 2020
2015 Population: Population of the Country/Territories in the year 2015
2010 Population: Population of the Country/Territories in the year 2010
2000 Population: Population of the Country/Territories in the year 2000
1990 Population: Population of the Country/Territories in the year 1990
1980 Population: Population of the Country/Territories in the year 1980
1970 Population: Population of the Country/Territories in the year 1970
Area (km²): Area size of the Country/Territories in square kilometers
Density (per km²): Population Density per square kilometer
Growth Rate: Population Growth Rate by Country/Territories
World Population Percentage: The population percentage by each Country/Territories

As you can see, there are a lot of things we can do with a dataset like this, such as finding the top ten most populous countries, and you can do the same by continent.

Table of Contents

Code Examples

Let’s do some basic analysis and load our dataset

We will check for missing values or duplicates

# Count the missing values in each column
missing_counts = df.isnull().sum()
print(missing_counts)
# Count the duplicated rows
duplicate_count = df.duplicated().sum()
print(duplicate_count)

This is a very good dataset and there were no missing values or duplicates

# imports
import pandas as pd
import numpy as np

# Data visualization using plotly graph objects (go)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots on notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode
#py.init_notebook_mode()

# Load the world population dataset from the current directory
df = pd.read_csv('world_population.csv')

# Quick look at the data: the first rows, then the (rows, columns) shape
first_rows = df.head()
print(first_rows)
print(df.shape)

# Number of missing values in each column
print(df.isnull().sum())
# Number of duplicated rows
print(df.duplicated().sum())

Top countries by population

Now let's look at the countries with the highest population in 2022.

# imports
import pandas as pd
import numpy as np

# Data visualization using plotly graph objects (go)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots on notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode
#py.init_notebook_mode()

# Load the dataset and repeat the basic sanity checks
df = pd.read_csv('world_population.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())      # missing values per column
print(df.duplicated().sum())  # number of duplicated rows

# Colour list handed to the bar markers
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

# Ten rows with the largest 2022 population, largest first
top_pop = df.sort_values(by='2022 Population', ascending=False).head(10)
print(top_pop[['Country/Territory', '2022 Population']])

# One bar per country, with the population value as the bar's text label
data = go.Bar(
    x=top_pop['Country/Territory'],
    y=top_pop['2022 Population'],
    text=top_pop['2022 Population'],
    textposition='outside',
    textfont={'size': 30, 'color': 'black'},
    marker={
        'color': colors,
        'opacity': 0.7,
        'line_color': 'black',
        'line_width': 2,
    },
)
layout = go.Layout(
    title=dict(
        text="<b>Top 10 Countries with highest population</b>",
        x=0.5,
        xanchor='center',
    ),
    xaxis={'title': 'Countries'},
    yaxis={'title': 'Populations'},
    width=900,
    height=600,
    template='plotly_white',
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

Smallest countries by population

Now let's look at the countries with the smallest population in 2022.

# Sort ascending by 2022 population and keep the first ten rows.
# The column name must use plain ASCII quotes; typographic quotes are a SyntaxError.
lowest_pop = df.sort_values(by = '2022 Population', ascending = True).head(10)

# imports
import pandas as pd
import numpy as np

# Data visualization using plotly graph objects (go)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots on notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode
#py.init_notebook_mode()

# Load the dataset and repeat the basic sanity checks
df = pd.read_csv('world_population.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())      # missing values per column
print(df.duplicated().sum())  # number of duplicated rows

# Colour list handed to the bar markers
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

# Ten rows with the smallest 2022 population, smallest first
lowest_pop = df.sort_values(by='2022 Population', ascending=True).head(10)
print(lowest_pop[['Country/Territory', '2022 Population']])

# One bar per country, with the population value as the bar's text label
data = go.Bar(
    x=lowest_pop['Country/Territory'],
    y=lowest_pop['2022 Population'],
    text=lowest_pop['2022 Population'],
    textposition='outside',
    textfont={'size': 10, 'color': 'black'},
    marker={
        'color': colors,
        'opacity': 0.7,
        'line_color': 'black',
        'line_width': 2,
    },
)
layout = go.Layout(
    title=dict(text='<b>Top 10 Countries with the lowest population</b>', x=0.5),
    xaxis={'title': '<b>Countries</b>'},
    yaxis={'title': '<b>Population</b>'},
    width=900,
    height=700,
    template='plotly_white',
)
lowfig = go.Figure(data=data, layout=layout)
# Rotate the x-axis tick labels by 90 degrees and set their font size
lowfig.update_xaxes(tickangle=90, tickfont_size=12)
iplot(lowfig)

largest European countries by population

# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data visualization using plotly graph objects (go)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio
import plotly.express as px

# For showing plotly plots on notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode
#py.init_notebook_mode()

# Load the dataset and repeat the basic sanity checks
df = pd.read_csv('world_population.csv')
#data analysis
print(df.head())
print(df.shape)
# Checking if there are any missing values in the data
print(df.isnull().sum())
# checking the duplicates
print(df.duplicated().sum())
colors = ["#1d7874","#679289","#f4c095","#ee2e31","#ffb563","#918450","#f85e00","#a41623","#9a031e","#d6d6d6","#ffee32","#ffd100","#333533","#202020"]

# Filter the European rows once and reuse them for both the plotted data
# and the bar order (the original evaluated the same filter twice).
europe = df[df["Continent"] == "Europe"]
# Names of the 11 European rows with the largest 2022 population, largest first
top_europe = europe.sort_values("2022 Population", ascending=False)["Country/Territory"][:11]

sns.set(rc={"axes.facecolor": "#F2EAC5", "figure.facecolor": "#F2EAC5"})
plt.subplots(figsize=(20, 10))
# Horizontal bars: countries on the y axis, population on the x axis.
# `order` restricts the plot to the countries listed in top_europe.
p = sns.barplot(
    data=europe,
    y="Country/Territory",
    x="2022 Population",
    order=top_europe,
    palette=colors[0:11:2],
    saturation=1,
    edgecolor="#1c1c1c",
    linewidth=4,
)
p.set_title("\nEuropean Population 2022\n", fontsize=25)
p.set_xlabel("Population", fontsize=20)
p.set_ylabel("\nCountry", fontsize=20)
# Rotate the x tick labels via tick_params instead of
# set_xticklabels(get_xticklabels()): re-setting the label texts without
# fixing the tick positions is discouraged by matplotlib and can leave
# labels that no longer match their ticks.
p.tick_params(axis="x", labelrotation=90)
# Annotate every bar with its value in a rounded orange box
for container in p.containers:
    p.bar_label(
        container,
        label_type="edge",
        padding=6,
        size=25,
        color="black",
        rotation=0,
        bbox={"boxstyle": "round", "pad": 0.4, "facecolor": "orange",
              "edgecolor": "#1c1c1c", "linewidth": 2, "alpha": 1},
    )

sns.despine(left=True, bottom=True)
plt.show()

Continents by percentage

# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data visualization using plotly graph objects (go)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio
import plotly.express as px

# For showing plotly plots on notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode
#py.init_notebook_mode()

# Load the dataset and repeat the basic sanity checks
df = pd.read_csv('world_population.csv')
#data analysis
print(df.head())
print(df.shape)
# Checking if there are any missing values in the data
print(df.isnull().sum())
# checking the duplicates
print(df.duplicated().sum())
colors = ["#1d7874","#679289","#f4c095","#ee2e31","#ffb563","#918450","#f85e00","#a41623","#9a031e","#d6d6d6","#ffee32","#ffd100","#333533","#202020"]

# Sum the world-population percentage per continent, largest share first
cont_pop = (
    df.groupby('Continent')[['World Population Percentage']]
    .sum()
    .sort_values(by='World Population Percentage', ascending=False)
)
# A bare `cont_pop` expression shows nothing in the middle of a script,
# so print it explicitly like the other checks above.
print(cont_pop)

# Pie chart with one slice per continent; hole=0.4 below turns it into a donut
fig = go.Figure(data=go.Pie(labels=cont_pop.index,
                            values=cont_pop['World Population Percentage'].values))
fig.update_traces(hoverinfo='label',
                  hole=0.4,
                  textfont_size=18,
                  textposition='auto',
                  marker=dict(colors=colors,
                              line=dict(color='white',
                                        width=2)))
fig.update_layout(
    title={'text': '<b>Continent Population Percentage</b>',
           'x': 0.21},
    template='xgridoff',
    width=900,
    height=600,
    legend=dict(
        title_font_family="Times New Roman",
        font=dict(
            family="Courier",
            size=20,
            color="black",
        ),
        bgcolor="white",
        bordercolor="Black",
        borderwidth=2.5,
    ),
)
iplot(fig)

Links

If you want the dataset and the code examples they are available from

https://github.com/programmershelp/maxpython/tree/main/Data%20Analysis/worldpopulation

Python