Exploratory analysis

Summary

Goal

The goal of this script is to conduct a brief exploratory analysis of the cleaned data, describing the distribution of each available variable as well as the nature of the possible association between salary and each of the other variables, using figures and tables with numerical summaries.

Remarks

On average, the individuals in the dataset are 37.4 years old, have 10 years of work experience and earn approximately USD 100k. Salary is positively associated with age and experience, and it is higher for males, for those with higher education levels, and for those in “Senior” or “Leadership” roles.

Code

import sys
import os
sys.path.append(os.path.join(os.getcwd(), "code"))
from modulos import create_frequency_table
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.tools as tls
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Load the cleaned dataset into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
# NOTE(review): the module search path above is built from os.getcwd()/"code"
# while this path climbs one directory up; confirm both resolve correctly from
# the directory this script is executed in.
datos = pd.read_csv('../data/clean_data.csv')

Exploration of each variable individually

The individuals in the dataset are, on average, 37.4 years old, with 10.0 years of work experience and a salary of approximately USD 100k. Gender is roughly evenly distributed. More than half hold only a Bachelor’s degree. About 60% hold a job with some level of hierarchy, as their job titles are related to the words “Senior” or “Leadership”.

Code

# Specification of each density panel, left to right:
# (column of `datos`, x-axis label, x tick positions).
density_panels = [
    ('age', "Age", range(20, 60, 10)),
    ('exp', "Years of experience", range(0, 30, 5)),
    ('salary', "Salary (USD)", range(25000, 176000, 75000)),
]

# A single figure holding one row with one density plot per variable.
plt.figure(figsize=(7, 2.5))

for panel_number, (column, axis_label, tick_positions) in enumerate(density_panels, start=1):
    plt.subplot(1, len(density_panels), panel_number)
    # cut=0 stops the estimated curve at the observed data limits.
    sns.kdeplot(datos[column], fill=True, color="blue", cut=0)
    plt.xlabel(axis_label)
    plt.ylabel("Density")
    # The whole y axis is hidden: only the shape of each curve is of interest.
    plt.gca().get_yaxis().set_visible(False)
    plt.xticks(ticks=tick_positions)

plt.suptitle("Distribution of age, years of experience and salary")
plt.show()

Code

# Bar charts of the categorical variables.

# Share of each category, expressed as a percentage. With normalize=True,
# value_counts returns proportions, which are then scaled by 100.
gender_counts = datos['gender'].value_counts(normalize=True).mul(100)
educ_counts = datos['educ'].value_counts(normalize=True).mul(100)
title_cat_counts = datos['title_cat'].value_counts(normalize=True).mul(100)

# One row with three panels, one per categorical variable.
fig_bar = make_subplots(rows=1, cols=3)

# (percentages, trace name) for each panel, left to right.
bar_panels = [
    (gender_counts, 'Gender'),
    (educ_counts, 'Education'),
    (title_cat_counts, 'Title Category'),
]
for panel_col, (shares, trace_name) in enumerate(bar_panels, start=1):
    bar_trace = go.Bar(
        x=shares.index,
        y=shares.values,
        marker_color='blue',
        name=trace_name,
    )
    fig_bar.add_trace(bar_trace, row=1, col=panel_col)

# Layout: per-panel x-axis titles and a "%" suffix on every y axis.
x_axis_titles = {
    "xaxis1_title_text": "Gender",
    "xaxis2_title_text": "Education",
    "xaxis3_title_text": "Job title related to",
}
percent_suffixes = {f"yaxis{axis_number}_ticksuffix": "%" for axis_number in (1, 2, 3)}

fig_bar.update_layout(
    title_text="Distribution of gender, education and word related to job title",
    title_font=dict(size=18),
    showlegend=False,
    height=250,
    width=750,
    yaxis1_title_text="Percentage",
    **x_axis_titles,
    **percent_suffixes,
)

# Render the figure.
fig_bar.show()

Statistics for quantitative variables

Code

# Descriptive statistics of the quantitative variables, one row per variable.
described = datos[['age', 'exp', 'salary']].describe().T

# Interquartile range: distance between the third and the first quartile.
described['IQR'] = described['75%'].sub(described['25%'])

# Keep only the reported statistics, in display order, with readable headers.
header_by_statistic = {
    'min': 'Min',
    'max': 'Max',
    'mean': 'Mean',
    '50%': 'Median',
    'std': 'St. Dev.',
    'IQR': 'IQR',
}
numeric_stats = described[list(header_by_statistic)].rename(columns=header_by_statistic)

# Show the statistics table with two decimal places.
numeric_stats.style.format(precision=2)

	Min	Max	Mean	Median	St. Dev.	IQR
age	23.00	53.00	37.41	36.00	7.07	13.00
exp	0.00	25.00	10.03	9.00	6.56	11.00
salary	30000.00	250000.00	100670.24	95000.00	48079.58	85000.00

Frequency tables for categorical variables

Code

# Build one frequency table per categorical variable by passing the column of
# `datos` and the heading to use for it to create_frequency_table.
tabla_gender, tabla_educ, tabla_title_cat = (
    create_frequency_table(datos[column], heading)
    for column, heading in [
        ("gender", "Gender"),
        ("educ", "Education"),
        ("title_cat", "Job Title related to"),
    ]
)

Code

# Render the gender frequency table as HTML, omitting the index column.
HTML(tabla_gender.to_html(index=False))

Gender	N	Percentage
Male	193	51.74
Female	177	47.45
NaN	3	0.80
Total	373	99.99

Code

# Render the education frequency table as HTML, omitting the index column.
HTML(tabla_educ.to_html(index=False))

Education	N	Percentage
Bachelor's	224	60.05
Master's	98	26.27
PhD	51	13.67
Total	373	99.99

Code

# Render the job-title category frequency table as HTML, omitting the index column.
HTML(tabla_title_cat.to_html(index=False))

Job Title related to	N	Percentage
Senior	152	40.75
Junior	90	24.13
Leadership	77	20.64
Other	54	14.48
Total	373	100.00

Relationship between salary and the other variables

Salary is strongly and positively associated with both age and years of experience. The salary distribution appears to be more right-skewed for males than for females. A clear salary increase is observed for higher education levels, as well as for positions linked to “Leadership” or “Senior”.

Code

# Figure with two side-by-side panels: salary against each numeric predictor.
fig = make_subplots(rows=1, cols=2)

# One scatter trace per predictor, left to right: 'age', then 'exp'.
for panel_col, predictor in enumerate(('age', 'exp'), start=1):
    scatter_trace = go.Scatter(
        x=datos[predictor],
        y=datos['salary'],
        mode='markers',  # points only, no connecting lines
        marker=dict(color='blue', opacity=0.5),  # semi-transparent blue points
    )
    fig.add_trace(scatter_trace, row=1, col=panel_col)

# Axis titles, figure title and figure size.
scatter_layout = dict(
    title_text="Salary vs age and years of experience",
    showlegend=False,
    xaxis_title="Age",
    xaxis2_title="Years of Experience",
    yaxis_title="Salary",
    yaxis2_title="Salary",
    width=600,
    height=250,
)
fig.update_layout(**scatter_layout)

# Render the figure.
fig.show()

Code

# Horizontal box plots of salary for each categorical variable, stacked in
# three rows of a single column.
fig_box = make_subplots(rows=3, cols=1)

# (grouping column of `datos`, trace name) for each panel, top to bottom.
box_panels = [
    ('gender', 'Gender'),
    ('educ', 'Education'),
    ('title_cat', 'Title Category'),
]
for panel_row, (grouping, trace_name) in enumerate(box_panels, start=1):
    box_trace = go.Box(
        y=datos[grouping],
        x=datos['salary'],
        name=trace_name,
        orientation='h',
        marker=dict(color='blue'),
    )
    fig_box.add_trace(box_trace, row=panel_row, col=1)

# Only the bottom panel carries the x-axis title; each panel names its grouping
# variable on the y axis.
box_layout = dict(
    title_text="Salary by gender, education and job title",
    showlegend=False,
    xaxis3_title="Salary",
    yaxis1_title="Gender",
    yaxis2_title="Education",
    yaxis3_title="Job title related to",
    width=450,
    height=450,
)
fig_box.update_layout(**box_layout)
fig_box.show()