The goal of this script is to conduct a brief exploratory analysis of the cleaned data to describe the distributions of the available variables, as well as the nature of the possible association between salary and each of the others, using figures and tables with numerical summaries.
Remarks
On average, the individuals in the dataset are 37.4 years old, have 10 years of experience, and earn approximately USD 100k. Salary is positively associated with age and experience, and it is higher for males, for those with more education, and for those in “Senior” or “Leadership” roles.
Code
# Standard library
import os
import sys

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.tools as tls
import seaborn as sns
from IPython.display import display, HTML
from plotly.subplots import make_subplots

# The "code" folder under the current working directory is added to the
# import path first, so that the local helper module can be imported.
sys.path.append(os.path.join(os.getcwd(), "code"))
from modulos import create_frequency_table

# Load the cleaned dataset
datos = pd.read_csv('../data/clean_data.csv')
Exploration of each variable individually
The individuals in the dataset are, on average, 37.4 years old, with 10.0 years of work experience and a salary of approximately USD 100k. Gender is roughly evenly distributed. More than half hold only a Bachelor’s degree. About 60% hold a job at a higher hierarchical level, as their job titles are related to the words “Senior” or “Leadership”.
Code
# One figure with a single row of three density panels
plt.figure(figsize=(7, 2.5))

# (column, x-axis label, x tick positions) for each panel, left to right
density_panels = [
    ('age', "Age", range(20, 60, 10)),
    ('exp', "Years of experience", range(0, 30, 5)),
    ('salary', "Salary (USD)", range(25000, 176000, 75000)),
]

for position, (column, x_label, tick_positions) in enumerate(density_panels, start=1):
    plt.subplot(1, 3, position)
    # cut=0 truncates the density curve at the observed data limits
    sns.kdeplot(datos[column], fill=True, color="blue", cut=0)
    plt.xlabel(x_label)
    plt.ylabel("Density")
    # The y axis is hidden right after being labelled, so only the shape
    # of each distribution is shown, not the density scale.
    plt.gca().get_yaxis().set_visible(False)
    plt.xticks(ticks=tick_positions)

plt.suptitle("Distribution of age, years of experience and salary")
plt.show()
Code
# Bar charts of the categorical variables: one row, one panel per variable
fig_bar = make_subplots(rows=1, cols=3)

# Percentage of rows in each category. value_counts() leaves out missing
# values by default, so the percentages are relative to non-missing rows.
gender_counts = datos['gender'].value_counts(normalize=True) * 100
educ_counts = datos['educ'].value_counts(normalize=True) * 100
title_cat_counts = datos['title_cat'].value_counts(normalize=True) * 100

# (percentages, trace name) for each panel, left to right
bar_panels = [
    (gender_counts, 'Gender'),
    (educ_counts, 'Education'),
    (title_cat_counts, 'Title Category'),
]

for panel_col, (shares, trace_name) in enumerate(bar_panels, start=1):
    fig_bar.add_trace(
        go.Bar(
            x=shares.index,
            y=shares.values,
            marker_color='blue',
            name=trace_name,
        ),
        row=1,
        col=panel_col,
    )

# Titles, size and axis formatting
fig_bar.update_layout(
    title_text="Distribution of gender, education and word related to job title",
    title_font=dict(size=18),
    showlegend=False,
    height=250,
    width=750,
    xaxis1_title_text="Gender",
    xaxis2_title_text="Education",
    xaxis3_title_text="Job title related to",
    yaxis1_title_text="Percentage",
    yaxis1_ticksuffix="%",
    yaxis2_ticksuffix="%",
    yaxis3_ticksuffix="%",
)

# Show the figure
fig_bar.show()
# Build one frequency table per categorical variable; the second argument
# is the label passed to the helper for that variable.
tabla_gender, tabla_educ, tabla_title_cat = (
    create_frequency_table(datos[column], header)
    for column, header in (
        ("gender", "Gender"),
        ("educ", "Education"),
        ("title_cat", "Job Title related to"),
    )
)
Code
# Render the gender frequency table as HTML without the DataFrame index.
# NOTE(review): the rendered Total percentage reads 99.99 rather than 100 —
# presumably the per-row percentages are rounded before being summed inside
# create_frequency_table; confirm there.
HTML(tabla_gender.to_html(index=False))
Gender
N
Percentage
Male
193
51.74
Female
177
47.45
NaN
3
0.80
Total
373
99.99
Code
# Render the education frequency table as HTML without the DataFrame index.
# NOTE(review): the rendered Total percentage reads 99.99 rather than 100 —
# presumably the per-row percentages are rounded before being summed inside
# create_frequency_table; confirm there.
HTML(tabla_educ.to_html(index=False))
Education
N
Percentage
Bachelor's
224
60.05
Master's
98
26.27
PhD
51
13.67
Total
373
99.99
Code
# Render the job-title-category frequency table as HTML without the DataFrame index.
HTML(tabla_title_cat.to_html(index=False))
Job Title related to
N
Percentage
Senior
152
40.75
Junior
90
24.13
Leadership
77
20.64
Other
54
14.48
Total
373
100.00
Relationship between salary and the other variables
Salary is strongly and positively associated with both age and years of experience. The salary distribution for males appears shifted toward higher values compared to that for females. A clear salary increase is observed at higher education levels, as well as for positions linked to “Leadership” or “Senior”.
Code
# Two side-by-side scatter plots: salary against age, and salary against
# years of experience
fig = make_subplots(rows=1, cols=2)

for panel_col, x_column in enumerate(('age', 'exp'), start=1):
    fig.add_trace(
        go.Scatter(
            x=datos[x_column],
            y=datos['salary'],
            mode='markers',
            # Semi-transparent blue markers so overlapping points stay visible
            marker=dict(color='blue', opacity=0.5),
        ),
        row=1,
        col=panel_col,
    )

# Axis titles, figure title and size
fig.update_layout(
    title_text="Salary vs age and years of experience",
    showlegend=False,
    xaxis_title="Age",
    xaxis2_title="Years of Experience",
    yaxis_title="Salary",
    yaxis2_title="Salary",
    width=600,
    height=250,
)

# Show the figure
fig.show()
Code
# Horizontal boxplots of salary by each categorical variable, stacked in
# three rows
fig_box = make_subplots(rows=3, cols=1)

# (grouping column, trace name) for each row, top to bottom
box_panels = [
    ('gender', 'Gender'),
    ('educ', 'Education'),
    ('title_cat', 'Title Category'),
]

for panel_row, (group_column, trace_name) in enumerate(box_panels, start=1):
    fig_box.add_trace(
        go.Box(
            y=datos[group_column],
            x=datos['salary'],
            name=trace_name,
            orientation='h',
            marker=dict(color='blue'),
        ),
        row=panel_row,
        col=1,
    )

# Only the bottom panel gets an x-axis title
fig_box.update_layout(
    title_text="Salary by gender, education and job title",
    showlegend=False,
    xaxis3_title="Salary",
    yaxis1_title="Gender",
    yaxis2_title="Education",
    yaxis3_title="Job title related to",
    width=450,
    height=450,
)

fig_box.show()