import sys
import os
"code"))
sys.path.append(os.path.join(os.getcwd(), from modulos import explore_missing
import pandas as pd
import matplotlib.pyplot as plt
Data cleansing and transformation
Summary
Goal
The goal of this script is to perform an initial inspection of the data, addressing missing values, inconsistencies in observations, reducing the number of levels in categorical variables, etc. The output will be a cleaned dataset to be used in subsequent stages of the analysis.
Remarks
Validation
- Cases with missings in the response variable salary are omitted.
- It is identified that the minimum observed value for salary needs to be corrected.
- Through a visual inspection of “title” (174 distinct values), it is deemed appropriate to transform it into a new variable, “title_cat”, which takes on 4 categories: “Junior,” “Senior,” “Leadership,” or “Other,” based on the words appearing in “title” and the rules described below.
Missing values
- Missing values for age are imputed using regular expressions, searching the job description for expressions like “years old”.
- Missing values for education are imputed using regular expressions, searching the job description for words like “Master’s,” “Bachelor’s,” or “PhD”.
- Missing values for title_cat are manually imputed (this can be improved) based on the job description.
- Missing values for gender cannot be inferred from the job description.
- Missing values for job description are not addressed either.
- Missing values for age are imputed using regular expressions, searching the job description for expressions like “years old”.
Job description (text data)
- This column will be used to train a text regression model (a transformer-based model where the response variable is numerical, and the only explanatory variable is a text string).
- As it is, it can be used, but to make better use of the available information, I prepend a string created from the values of the other variables to the beginning of the job description string.
- This column will be used to train a text regression model (a transformer-based model where the response variable is numerical, and the only explanatory variable is a text string).
Libraries and modules
Read and merge data sources
# Read data
= pd.read_csv("../data/people.csv")
people = pd.read_csv("../data/descriptions.csv")
descr = pd.read_csv("../data/salary.csv")
salary
# merge
= (
datos
people="id", how="outer")
.merge(descr, on="id", how="outer")
.merge(salary, on={
.rename(columns"Age": "age",
"Gender": "gender",
"Education Level": "educ",
"Job Title": "title",
"Years of Experience": "exp",
"Description": "descr",
"Salary": "salary"
})=["salary"]) # Filtrar casos donde salary es NaN
.dropna(subset
)
datos.head()
id | age | gender | educ | title | exp | descr | salary | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 32.0 | Male | Bachelor's | Software Engineer | 5.0 | I am a 32-year-old male working as a Software ... | 90000.0 |
1 | 1 | 28.0 | Female | Master's | Data Analyst | 3.0 | I am a 28-year-old data analyst with a Master'... | 65000.0 |
2 | 2 | 45.0 | Male | PhD | Senior Manager | 15.0 | I am a 45-year-old Senior Manager with a PhD a... | 150000.0 |
3 | 3 | 36.0 | Female | Bachelor's | Sales Associate | 7.0 | I am a 36-year-old female Sales Associate with... | 60000.0 |
4 | 4 | 52.0 | Male | Master's | Director | 20.0 | I am a 52-year-old male with over two decades ... | 200000.0 |
Validation
Categorical values
# values in categorical variables
'gender'].value_counts(dropna=False)
datos['educ'].value_counts(dropna=False)
datos['title'].value_counts(dropna=False).sort_values(ascending=False)
datos[
# new column title_cat
'title_cat'] = datos['title'].apply(
datos[lambda x: ("Senior" if "Senior" in x else
"Junior" if "Junior" in x else
"Leadership" if any(word in x for word in ["Manager", "CEO", "Chief", "Director", "Principal", "Associate"]) else
"Other") if pd.notna(x) else pd.NA
)
Numerical variables
# values in numerical variables
'salary', 'age', 'exp']].describe().style.format(precision=2) datos[[
salary | age | exp | |
---|---|---|---|
count | 373.00 | 370.00 | 373.00 |
mean | 100577.35 | 37.44 | 10.03 |
std | 48240.01 | 7.08 | 6.56 |
min | 350.00 | 23.00 | 0.00 |
25% | 55000.00 | 31.25 | 4.00 |
50% | 95000.00 | 36.00 | 9.00 |
75% | 140000.00 | 44.00 | 15.00 |
max | 250000.00 | 53.00 | 25.00 |
# salary validation
=(8, 6))
plt.figure(figsize'salary'], bins=30, color='blue', edgecolor='black', alpha=0.7)
plt.hist(datos['Salary distribution')
plt.title('Salary')
plt.xlabel('Frequency')
plt.ylabel( plt.show()
# min de salario es muy pequeño, revisar
'salary'] == datos['salary'].min()].values
datos.loc[datos[
# se debe corregir este valor minimo de salario, por la descripcion puede ser 35000
'salary'] = datos['salary'].replace(350, 35000) # todo: mejorar
datos[
# por las dudas veo el maximo
'salary'] == datos['salary'].max()] datos.loc[datos[
id | age | gender | educ | title | exp | descr | salary | title_cat | |
---|---|---|---|---|---|---|---|---|---|
30 | 30 | 50.0 | Male | Bachelor's | CEO | 25.0 | I am a 50-year-old male with a Bachelor's degr... | 250000.0 | Leadership |
83 | 83 | 52.0 | Male | PhD | Chief Technology Officer | 24.0 | I am a 52-year-old male with a PhD and over tw... | 250000.0 | Leadership |
Missing values
explore_missing(datos)
Complete cases: 358
Proportion of complete cases: 0.9597855227882037
Proportion of complete cells: 0.9946380697050938
Missing values by column:
Variable Missing Count Missing Proportion
0 id 0 0.000000
1 age 3 0.008043
2 gender 3 0.008043
3 educ 3 0.008043
4 title 3 0.008043
5 exp 0 0.000000
6 descr 3 0.008043
7 salary 0 0.000000
8 title_cat 3 0.008043
# Age imputation
= datos['age'].isna()
na_age_idx = datos.loc[na_age_idx, 'descr']
na_age_descr = na_age_descr.str.extract(r"(\d{2})[- ]year[- ]old")[0].astype(float)
edades 'age'] = edades
datos.loc[na_age_idx,
# Educ imputation
'educ'] = datos.apply(
datos[lambda row: "Master's" if pd.isna(row['educ']) and "Master" in row['descr'] else
"Bachelor's" if pd.isna(row['educ']) and "Bachelor" in row['descr'] else
"PhD" if pd.isna(row['educ']) and "Ph" in row['descr'] else
'educ'],
row[=1
axis
)
# Job Title imputation (manual, todo: mejorar)
'title'].isna(), 'title_cat'] = ['Other', 'Leadership', 'Senior']
datos.loc[datos[
explore_missing(datos)
Complete cases: 363
Proportion of complete cases: 0.9731903485254692
Proportion of complete cells: 0.9970211498361632
Missing values by column:
Variable Missing Count Missing Proportion
0 id 0 0.000000
1 age 1 0.002681
2 gender 3 0.008043
3 educ 0 0.000000
4 title 3 0.008043
5 exp 0 0.000000
6 descr 3 0.008043
7 salary 0 0.000000
8 title_cat 0 0.000000
Job description (text data)
# Crear nueva columna con descripción completa
'text'] = (
datos["Age: " + datos['age'].astype(str) +
" - Gender: " + datos['gender'].astype(str) +
" - Education level: " + datos['educ'].astype(str) +
" - Title: " + datos['title'].astype(str) +
" - Years of experience: " + datos['exp'].astype(str) +
" - Job description: " + datos['descr'].astype(str)
)
Saving transformed data
Save clean data
= datos.drop(columns=["id"])
datos '../data/clean_data.csv', index = False) datos.to_csv(
Create sample data
These files will can be used as examples when using the app for making predictions with the trained models.
# Seleccionar al azar 5 filas y columnas específicas
= datos.sample(n = 5, random_state = 200)
sample_rows 'age', 'gender', 'educ', 'title_cat', 'exp']].to_csv('../data/sample_data_vars.csv', index = False)
sample_rows[['text']].to_csv('../data/sample_data_onlytext.csv', index = False)
sample_rows[['age', 'gender']].to_csv('../data/sample_data_wrong.csv', index = False) sample_rows[[