options (scipen = 999)
library(tidyverse)
library(tidymodels)
library(tidytext)
library(janitor)
library(skimr)
library(vip)
library(parallel) # - new
library(doParallel) # - new
library(rpart.plot)
library(textrecipes)
library(stringi)
library(xgboost)
job_holdout <- read.csv("job_holdout.csv")
job_training <- read.csv("job_training.csv")
job_training <- job_training %>% janitor::clean_names()
job_training %>% skimr::skim_without_charts()
Name | Piped data |
Number of rows | 12516 |
Number of columns | 18 |
_______________________ | |
Column type frequency: | |
character | 13 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1.00 | 3 | 142 | 0 | 8069 | 0 |
location | 234 | 0.98 | 2 | 161 | 0 | 2485 | 0 |
department | 8070 | 0.36 | 1 | 34 | 0 | 732 | 0 |
salary_range | 10537 | 0.16 | 3 | 16 | 0 | 700 | 0 |
company_profile | 2328 | 0.81 | 14 | 5578 | 0 | 1514 | 0 |
description | 0 | 1.00 | 3 | 22693 | 0 | 10564 | 0 |
requirements | 1892 | 0.85 | 1 | 6356 | 0 | 8568 | 0 |
benefits | 5057 | 0.60 | 2 | 4489 | 0 | 4643 | 0 |
employment_type | 2403 | 0.81 | 5 | 9 | 0 | 5 | 0 |
required_experience | 4955 | 0.60 | 8 | 16 | 0 | 7 | 0 |
required_education | 5665 | 0.55 | 9 | 33 | 0 | 13 | 0 |
industry | 3424 | 0.73 | 5 | 36 | 0 | 129 | 0 |
job_function | 4487 | 0.64 | 5 | 22 | 0 | 37 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
job_id | 0 | 1 | 8940.07 | 5154.13 | 1 | 4483.5 | 8949.5 | 13384.25 | 17880 |
telecommuting | 0 | 1 | 0.04 | 0.21 | 0 | 0.0 | 0.0 | 0.00 | 1 |
has_company_logo | 0 | 1 | 0.79 | 0.40 | 0 | 1.0 | 1.0 | 1.00 | 1 |
has_questions | 0 | 1 | 0.49 | 0.50 | 0 | 0.0 | 0.0 | 1.00 | 1 |
fraudulent | 0 | 1 | 0.05 | 0.22 | 0 | 0.0 | 0.0 | 0.00 | 1 |
job_training <- job_training %>% select(-location)
job_holdout <- job_holdout %>% select(-location)
job_training <- job_training %>%
mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
select(-salary_range)
job_holdout <- job_holdout %>%
mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
select(-salary_range)
job_training <- job_training %>%
mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
select(-department)
job_holdout <- job_holdout %>%
mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
select(-department)
#---
job_training$employment_type[is.na(job_training$employment_type)] <- "Other"
job_holdout$employment_type[is.na(job_holdout$employment_type)] <- "Other"
#---
job_training$required_experience[is.na(job_training$required_experience)] <- "Not Applicable"
job_holdout$required_experience[is.na(job_holdout$required_experience)] <- "Not Applicable"
#---
job_training$required_education[is.na(job_training$required_education)] <- "Unspecified"
job_holdout$required_education[is.na(job_holdout$required_education)] <- "Unspecified"
####---
industry_fraud <- job_training %>%
group_by(fraudulent, industry) %>%
select(industry) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from= n, values_fill = 0.0) %>%
mutate(industry_pct_fraud = `1`/(`1`+`0`)) %>%
select(industry, industry_pct_fraud)
job_training <- job_training %>%
left_join(industry_fraud) %>%
select(-industry)
job_holdout <- job_holdout %>%
left_join(industry_fraud) %>%
select(-industry)
remove(industry_fraud)
###---
job_function_fraud <- job_training %>%
group_by(fraudulent, job_function) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n, values_fill = 0.0) %>%
mutate(jfunction_pct_fraud = `1`/(`0`+`1`)) %>%
select(job_function, jfunction_pct_fraud)
job_training <- job_training %>%
left_join(job_function_fraud) %>%
select(-job_function)
job_holdout <- job_holdout %>%
left_join(job_function_fraud) %>%
select(-job_function)
remove(job_function_fraud)
job_training <- job_training %>%
mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words =
ifelse(is.na(description), 0, str_count(description, boundary("word"))),
requirement_words =
ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))
job_holdout <- job_holdout %>%
mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words =
ifelse(is.na(description), 0, str_count(description, boundary("word"))),
requirement_words =
ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))
job_training <- job_training %>%
mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))
job_holdout <- job_holdout %>%
mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))
training_df <- job_training %>%
mutate_if(is.character, as_factor) %>%
mutate(fraudulent = as.factor(fraudulent)) %>%
mutate(telecommuting = as.factor(telecommuting)) %>%
mutate(has_company_logo = as.factor(has_company_logo)) %>%
mutate(has_questions = as.factor(has_questions)) %>%
mutate(has_salary_range = as.factor(has_salary_range)) %>%
mutate(has_department = as.factor(has_department))
holdout_df <- job_holdout %>%
mutate_if(is.character, as_factor) %>%
mutate(telecommuting = as.factor(telecommuting)) %>%
mutate(has_company_logo = as.factor(has_company_logo)) %>%
mutate(has_questions = as.factor(has_questions)) %>%
mutate(has_salary_range = as.factor(has_salary_range)) %>%
mutate(has_department = as.factor(has_department))
skimr::skim_without_charts(data=training_df)
Name | training_df |
Number of rows | 12516 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
factor | 14 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
title | 0 | 1.00 | FALSE | 8040 | Eng: 293, Cus: 150, Gra: 103, Sof: 73 |
company_profile | 2328 | 0.81 | FALSE | 1514 | We : 528, We : 485, Nov: 416, Est: 321 |
description | 0 | 1.00 | FALSE | 10563 | Pla: 275, Pla: 49, Pla: 32, Pla: 24 |
requirements | 1892 | 0.85 | FALSE | 8567 | Uni: 296, Uni: 117, 16 : 77, Min: 60 |
benefits | 5057 | 0.60 | FALSE | 4639 | See: 528, Car: 104, CSD: 50, Fut: 44 |
telecommuting | 0 | 1.00 | FALSE | 2 | 0: 11962, 1: 554 |
has_company_logo | 0 | 1.00 | FALSE | 2 | 1: 9939, 0: 2577 |
has_questions | 0 | 1.00 | FALSE | 2 | 0: 6348, 1: 6168 |
employment_type | 0 | 1.00 | FALSE | 5 | Ful: 8127, Oth: 2572, Con: 1086, Par: 564 |
required_experience | 0 | 1.00 | FALSE | 7 | Not: 5738, Mid: 2651, Ent: 1923, Ass: 1568 |
required_education | 0 | 1.00 | FALSE | 13 | Uns: 6641, Bac: 3600, Hig: 1471, Mas: 285 |
fraudulent | 0 | 1.00 | FALSE | 2 | 0: 11899, 1: 617 |
has_salary_range | 0 | 1.00 | FALSE | 2 | 0: 10537, 1: 1979 |
has_department | 0 | 1.00 | FALSE | 2 | 0: 8070, 1: 4446 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
job_id | 0 | 1 | 8940.07 | 5154.13 | 1 | 4483.50 | 8949.50 | 13384.25 | 17880.00 |
industry_pct_fraud | 0 | 1 | 0.05 | 0.07 | 0 | 0.00 | 0.05 | 0.06 | 1.00 |
jfunction_pct_fraud | 0 | 1 | 0.05 | 0.04 | 0 | 0.02 | 0.05 | 0.05 | 0.18 |
company_profile_words | 0 | 1 | 96.09 | 92.80 | 0 | 20.00 | 86.00 | 138.00 | 891.00 |
description_words | 0 | 1 | 186.19 | 226.97 | 1 | 90.00 | 150.00 | 237.00 | 10607.00 |
requirement_words | 0 | 1 | 82.94 | 88.38 | 0 | 19.00 | 65.00 | 115.00 | 1001.00 |
benefit_words | 0 | 1 | 31.92 | 54.85 | 0 | 0.00 | 6.00 | 44.00 | 730.00 |
set.seed(42)
df_split <- initial_split(data = training_df, prop = 0.7, strata = fraudulent)
train <- training(df_split)
test <- testing(df_split)
set.seed(42)
train_fold <- vfold_cv(data = train, v = 5)
job_training %>%
ggplot(aes(fraudulent)) +
geom_bar(stat = "count") +
labs(title = "Distribution of Fraudulent Jobs")
job_training %>%
group_by(fraudulent) %>%
summarize(n=n()) %>%
mutate(pct = round(n/sum(n), 2))
## # A tibble: 2 x 3
## fraudulent n pct
## <int> <int> <dbl>
## 1 0 11899 0.95
## 2 1 617 0.05
job_training %>%
select(where(is.numeric), -job_id) %>%
lares::corr_var(fraudulent)
plot_numeric <- function(col){
training_df %>%
ggplot(aes(x=fraudulent, y=!!as.name(col))) +
geom_boxplot()
}
for (column in names(training_df %>% select(where(is.numeric), -job_id))){
print(plot_numeric(column))
}
Industry percentage fraud and job function percentage fraud
training_df %>%
select(fraudulent, where(is.numeric), -job_id) %>%
group_by(fraudulent) %>%
summarize(industry_pct = round(mean(industry_pct_fraud), 2),
job_function_pct = round(mean(jfunction_pct_fraud), 2)) %>%
ungroup() %>%
mutate_if(is.numeric, round, 2)
## # A tibble: 2 x 3
## fraudulent industry_pct job_function_pct
## <fct> <dbl> <dbl>
## 1 0 0.04 0.05
## 2 1 0.14 0.08
Number of words used to describe company profile, job description job requirements, and benefits
training_df %>%
select(fraudulent, where(is.numeric), -job_id) %>%
group_by(fraudulent) %>%
summarize(company_profile = round(mean(company_profile_words), 0),
description = round(mean(description_words), 0),
requirement = round(mean(requirement_words), 0),
benefit = round(mean(benefit_words), 0)) %>%
ungroup() %>%
mutate_if(is.numeric, round, 2)
## # A tibble: 2 x 5
## fraudulent company_profile description requirement benefit
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 99 187 84 32
## 2 1 33 170 64 32
plot_factor <- function(col){
training_df %>%
ggplot(., aes(!!as.name(col))) +
geom_bar(aes(fill = fraudulent), position = "fill") +
coord_flip()
}
for(column in names(training_df %>%
select(telecommuting, has_company_logo, has_questions,
employment_type, required_experience, required_education,
has_salary_range,has_department)))
{
print(plot_factor(column))
}
Telecommuting
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, telecommuting) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(telecommuting, legit, fraud, pct) %>%
arrange(desc(pct))
## # A tibble: 2 x 4
## telecommuting legit fraud pct
## <fct> <int> <int> <dbl>
## 1 1 508 46 0.08
## 2 0 11391 571 0.05
has_company_logo
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, has_company_logo) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(has_company_logo, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 2 x 4
## has_company_logo legit fraud pct
## <fct> <int> <int> <dbl>
## 1 0 2169 408 0.16
## 2 1 9730 209 0.02
has_questions
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, has_questions) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(has_questions, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 2 x 4
## has_questions legit fraud pct
## <fct> <int> <int> <dbl>
## 1 0 5915 433 0.07
## 2 1 5984 184 0.03
employment_type
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, employment_type) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(employment_type, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 5 x 4
## employment_type legit fraud pct
## <fct> <int> <int> <dbl>
## 1 Part-time 512 52 0.09
## 2 Other 2391 181 0.07
## 3 Full-time 7774 353 0.04
## 4 Contract 1056 30 0.03
## 5 Temporary 166 1 0.01
required_experience
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, required_experience) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(required_experience, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 7 x 4
## required_experience legit fraud pct
## <fct> <int> <int> <dbl>
## 1 Executive 88 8 0.08
## 2 Entry level 1798 125 0.07
## 3 Not Applicable 5385 353 0.06
## 4 Director 262 15 0.05
## 5 Internship 256 7 0.03
## 6 Mid-Senior level 2566 85 0.03
## 7 Associate 1544 24 0.02
required_education
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, required_education) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(required_education, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 13 x 4
## required_education legit fraud pct
## <fct> <int> <int> <dbl>
## 1 Some High School Coursework 5 15 0.75
## 2 Certification 105 12 0.1
## 3 Master's Degree 258 27 0.09
## 4 High School or equivalent 1354 117 0.08
## 5 Doctorate 17 1 0.06
## 6 Unspecified 6280 361 0.05
## 7 Professional 59 3 0.05
## 8 Some College Coursework Completed 62 2 0.03
## 9 Associate Degree 186 5 0.03
## 10 Bachelor's Degree 3526 74 0.02
## 11 Vocational 36 NA NA
## 12 Vocational - Degree 3 NA NA
## 13 Vocational - HS Diploma 8 NA NA
has_salary_range
training_df %>%
select(where(is.factor)) %>%
group_by(fraudulent, has_salary_range) %>%
summarize(n=n()) %>%
pivot_wider(names_from = fraudulent, values_from = n) %>%
mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
select(has_salary_range, legit, fraud, pct)%>%
arrange(desc(pct))
## # A tibble: 2 x 4
## has_salary_range legit fraud pct
## <fct> <int> <int> <dbl>
## 1 1 1822 157 0.08
## 2 0 10077 460 0.04
training_recipe <- training_df %>%
recipe(fraudulent ~.) %>%
update_role(job_id, new_role = "IGNORE") %>%
step_unknown(all_nominal_predictors()) %>%
step_impute_mean(all_numeric_predictors()) %>%
step_tokenize(title, company_profile, description, requirements, benefits) %>%
step_stopwords(title, company_profile, description, requirements, benefits) %>%
step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
step_tf(title, company_profile, description, requirements, benefits) %>%
step_other(all_nominal_predictors(), threshold = 0.01)
rf_model <- rand_forest(min_n = tune(), trees = tune()) %>%
set_mode("classification") %>%
set_engine("ranger", importance = "permutation")
rf_workflow <- workflow() %>%
add_recipe(training_recipe) %>%
add_model(rf_model)
all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)
rf_grid <- grid_regular(
min_n(),
trees(c(5,20)),
levels = 5
)
rf_tuning_results <- rf_workflow %>%
tune_grid(
resamples = train_fold,
grid = rf_grid
)
rf_model <- rf_workflow %>%
finalize_workflow(select_best(rf_tuning_results))
rf_fit <- fit(rf_model, train)
Tuning metrics - Table
rf_tuning_results_table <- rf_tuning_results %>%
collect_metrics()%>%
mutate_if(is.numeric, round, 3)%>%
select(.metric, trees, min_n, mean, std_err) %>%
arrange(desc(mean))
rf_tuning_results_table %>%
filter(.metric == "roc_auc")
## # A tibble: 25 x 5
## .metric trees min_n mean std_err
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 roc_auc 20 11 0.969 0.002
## 2 roc_auc 20 30 0.965 0.004
## 3 roc_auc 20 40 0.964 0.002
## 4 roc_auc 16 11 0.962 0.004
## 5 roc_auc 16 30 0.962 0.002
## 6 roc_auc 12 21 0.961 0.004
## 7 roc_auc 16 21 0.961 0.004
## 8 roc_auc 16 40 0.961 0.003
## 9 roc_auc 20 21 0.961 0.003
## 10 roc_auc 12 40 0.959 0.002
## # ... with 15 more rows
Tuning metrics - Effect of “trees”
rf_tuning_results_table %>%
ggplot(aes(trees, mean)) +
geom_line() +
facet_wrap(~.metric, scales="free", nrow = 2)
Tuning metrics - Effect of “min_n”
rf_tuning_results_table %>%
ggplot(aes(min_n, mean)) +
geom_line() +
facet_wrap(~.metric, scales="free", nrow = 2)
Fit model metrics - Scores
options(yardstick.event_first = FALSE)
rf_train_scored <- predict(rf_fit, train, type = "prob") %>%
bind_cols(predict(rf_fit, train)) %>%
bind_cols(train) %>%
mutate(model = "random forest", scored_on = "train data") %>%
select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)
rf_test_scored <- predict(rf_fit, test, type = "prob") %>%
bind_cols(predict(rf_fit, test)) %>%
bind_cols(test) %>%
mutate(model = "random forest", scored_on = "test data") %>%
select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)
rf_scored <- rf_train_scored %>%
bind_rows(rf_test_scored)
Fit model metrics - important variables
rf_fit %>%
pull_workflow_fit() %>%
vip(10)
Fit model metrics - Classification Matrix
rf_train_scored %>%
conf_mat(fraudulent, .pred_class) %>%
autoplot(type="heatmap") +
labs(title = "Classification Matrix - Train Data")
rf_test_scored %>%
conf_mat(fraudulent, .pred_class) %>%
autoplot(type="heatmap") +
labs(title = "Classification Matrix - Test Data")
Fit model metrics - ROC_AUC Curve
rf_scored %>%
group_by(scored_on) %>%
roc_curve(fraudulent, .pred_1) %>%
autoplot() +
geom_vline(xintercept = 0.1, color="blue", size=0.5)
Metrics - accuracy, kap, mn_log_loss and roc_auc
rf_metrics_roc_logloss_etc <- rf_test_scored %>%
metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
mutate(model = "random forest", scored_on = "test data") %>%
bind_rows(
rf_train_scored %>%
metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
mutate(model = "random forest", scored_on = "train data")
) %>%
group_by(model, scored_on) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
mutate_if(is.numeric, round, 2)
rf_metrics_roc_logloss_etc
## # A tibble: 2 x 7
## # Groups: model, scored_on [2]
## model .estimator scored_on accuracy kap mn_log_loss roc_auc
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 random forest binary test data 0.97 0.61 0.1 0.96
## 2 random forest binary train data 0.99 0.88 0.04 1
Metrics - precision and recall
rf_metrics_precision_and_recall <- rf_train_scored %>%
precision(truth = fraudulent, estimate = .pred_class) %>%
bind_rows(rf_train_scored %>%
recall(truth = fraudulent, estimate = .pred_class)) %>%
mutate(model = "random forest", scored_on = "train data") %>%
bind_rows(
rf_test_scored %>%
precision(truth = fraudulent, estimate = .pred_class) %>%
bind_rows(rf_test_scored %>%
recall(truth = fraudulent, estimate = .pred_class)) %>%
mutate(model = "random forest", scored_on = "test data")
) %>%
group_by(model, scored_on) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
select(model, .estimator, scored_on, precision, recall) %>%
mutate_if(is.numeric, round, 2)
rf_metrics_precision_and_recall
## # A tibble: 2 x 5
## # Groups: model, scored_on [2]
## model .estimator scored_on precision recall
## <chr> <chr> <chr> <dbl> <dbl>
## 1 random forest binary train data 1 0.8
## 2 random forest binary test data 0.95 0.46
Fit model metrics - Threshold
rf_test_scored %>%
roc_curve(fraudulent, .pred_1) %>%
mutate(fpr = 1-round(specificity, 2),
tpr = round(sensitivity, 3),
threshold = 1 - round(.threshold, 3)) %>%
select(fpr, tpr, threshold) %>%
group_by(fpr) %>%
summarize(tpr = max(tpr),
threshold = max(threshold)) %>%
ungroup() %>%
mutate(precision = tpr/(fpr + tpr)) %>%
mutate_if(is.numeric, round, 2) %>%
arrange(fpr) %>%
filter(fpr <= 0.1)
## # A tibble: 11 x 4
## fpr tpr threshold precision
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.54 0.61 1
## 2 0.01 0.63 0.73 0.98
## 3 0.02 0.68 0.79 0.97
## 4 0.03 0.72 0.83 0.96
## 5 0.04 0.74 0.85 0.95
## 6 0.05 0.77 0.87 0.94
## 7 0.06 0.8 0.88 0.93
## 8 0.07 0.84 0.9 0.92
## 9 0.08 0.86 0.91 0.92
## 10 0.09 0.88 0.92 0.91
## 11 0.1 0.88 0.92 0.9
xgboost_recipe <- training_df %>%
recipe(fraudulent ~.) %>%
update_role(job_id, new_role = "IGNORE") %>%
step_nzv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_impute_mean(all_numeric_predictors()) %>%
step_tokenize(title, company_profile, description, requirements, benefits) %>%
step_stopwords(title, company_profile, description, requirements, benefits) %>%
step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
step_tf(title, company_profile, description, requirements, benefits) %>%
step_other(all_nominal_predictors(), threshold = 0.05) %>%
step_string2factor(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors())
xgboost_model <- boost_tree(trees = 5,
tree_depth = tune(),
learn_rate = tune()) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgboost_wf <- workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_model)
xgboost_grid <- grid_regular(
tree_depth(),
learn_rate(),
levels=5)
all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)
xgboost_tuning_results <- xgboost_wf %>%
tune_grid(
resamples = train_fold,
grid = xgboost_grid
)
xgboost_model <- xgboost_wf %>%
finalize_workflow(select_best(xgboost_tuning_results))
xgboost_fit <- fit(xgboost_model, train)
## [20:40:58] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Tuning metrics Scores - Table
xgboost_tuning_results %>%
collect_metrics() %>%
mutate_if(is.numeric, round, 3) %>%
select(.metric, tree_depth, learn_rate, mean, std_err) %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean))
## # A tibble: 25 x 5
## .metric tree_depth learn_rate mean std_err
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 roc_auc 11 0.1 0.911 0.006
## 2 roc_auc 15 0.1 0.911 0.007
## 3 roc_auc 8 0.1 0.91 0.006
## 4 roc_auc 4 0.1 0.89 0.004
## 5 roc_auc 15 0 0.884 0.009
## 6 roc_auc 11 0.001 0.884 0.011
## 7 roc_auc 15 0.001 0.884 0.01
## 8 roc_auc 8 0 0.883 0.01
## 9 roc_auc 11 0 0.883 0.01
## 10 roc_auc 8 0.001 0.883 0.011
## # ... with 15 more rows
Tuning metrics Scores - Effects of “tree_depth”
xgboost_tuning_results %>%
collect_metrics() %>%
mutate_if(is.numeric, round, 3) %>%
select(.metric, tree_depth, learn_rate, mean, std_err) %>%
ggplot(aes(tree_depth, mean)) +
geom_line() +
facet_wrap(~.metric, scales = "free", nrow = 2)
Tuning metrics Scores - Effect of “learn_rate”
xgboost_tuning_results %>%
collect_metrics() %>%
mutate_if(is.numeric, round, 3) %>%
select(.metric, tree_depth, learn_rate, mean, std_err) %>%
ggplot(aes(learn_rate, mean)) +
geom_line() +
facet_wrap(~.metric, scales = "free", nrow = 2)
options(yardstick.event_first = FALSE)
xgboost_train_scored <- predict(xgboost_fit, train, type = "prob") %>%
bind_cols(predict(xgboost_fit, train)) %>%
bind_cols(train) %>%
mutate(model = "xgboost", scored_on = "train data") %>%
select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)
xgboost_test_scored <- predict(xgboost_fit, test, type = "prob") %>%
bind_cols(predict(xgboost_fit, test)) %>%
bind_cols(test) %>%
mutate(model = "xgboost", scored_on = "test data") %>%
select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)
xgboost_scored <- xgboost_train_scored %>%
bind_rows(xgboost_test_scored)
Important variables
xgboost_fit %>%
pull_workflow_fit() %>%
vip(10)
Fit model metrics - ROC_AUC Curve
xgboost_scored %>%
group_by(scored_on) %>%
roc_curve(fraudulent, .pred_1) %>%
autoplot() +
geom_vline(xintercept = 0.1, color="blue", size=0.5)
Metrics - accuracy, kap, mn_log_loss and roc_auc
xgboost_metrics_roc_logloss_etc <- xgboost_test_scored %>%
metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
mutate(model = "xgboost", scored_on = "test data") %>%
bind_rows(
xgboost_train_scored %>%
metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
mutate(model = "xgboost", scored_on = "train data")
) %>%
group_by(model, scored_on) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
mutate_if(is.numeric, round, 2)
xgboost_metrics_roc_logloss_etc
## # A tibble: 2 x 7
## # Groups: model, scored_on [2]
## model .estimator scored_on accuracy kap mn_log_loss roc_auc
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 xgboost binary test data 0.97 0.55 0.39 0.9
## 2 xgboost binary train data 0.98 0.79 0.38 0.94
Metrics - precision and recall
xgboost_metrics_precision_and_recall <- xgboost_train_scored %>%
precision(truth = fraudulent, estimate = .pred_class) %>%
bind_rows(rf_train_scored %>%
recall(truth = fraudulent, estimate = .pred_class)) %>%
mutate(model = "xgboost", scored_on = "train data") %>%
bind_rows(
xgboost_test_scored %>%
precision(truth = fraudulent, estimate = .pred_class) %>%
bind_rows(rf_test_scored %>%
recall(truth = fraudulent, estimate = .pred_class)) %>%
mutate(model = "xgboost", scored_on = "test data")
) %>%
group_by(model, scored_on) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
select(model, .estimator, scored_on, precision, recall) %>%
mutate_if(is.numeric, round, 2)
xgboost_metrics_precision_and_recall
## # A tibble: 2 x 5
## # Groups: model, scored_on [2]
## model .estimator scored_on precision recall
## <chr> <chr> <chr> <dbl> <dbl>
## 1 xgboost binary train data 0.93 0.8
## 2 xgboost binary test data 0.71 0.46
Fit model metrics - threshold
rf_test_scored %>%
roc_curve(fraudulent, .pred_1) %>%
mutate(fpr = 1-round(specificity, 2),
tpr = round(sensitivity, 3),
threshold = 1 - round(.threshold, 3)) %>%
select(fpr, tpr, threshold) %>%
group_by(fpr) %>%
summarize(tpr = max(tpr),
threshold = max(threshold)) %>%
ungroup() %>%
mutate(precision = round(tpr/(fpr + tpr), 2)) %>%
select(fpr, tpr, precision, threshold) %>%
arrange(fpr) %>%
filter(fpr <= 0.1)
## # A tibble: 11 x 4
## fpr tpr precision threshold
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.536 1 0.612
## 2 0.0100 0.63 0.98 0.731
## 3 0.0200 0.68 0.97 0.789
## 4 0.0300 0.718 0.96 0.827
## 5 0.0400 0.74 0.95 0.847
## 6 0.0500 0.773 0.94 0.866
## 7 0.0600 0.796 0.93 0.883
## 8 0.0700 0.84 0.92 0.898
## 9 0.0800 0.862 0.92 0.908
## 10 0.09 0.878 0.91 0.916
## 11 0.1 0.884 0.9 0.92
rf_scored %>%
bind_rows(xgboost_scored) %>%
filter(scored_on == "test data") %>%
group_by(model)%>%
roc_curve(fraudulent, .pred_1) %>%
autoplot() +
geom_vline(xintercept = 0.1, color = "blue", size = 0.5)
precision_and_recall <- rf_metrics_precision_and_recall %>%
bind_rows(xgboost_metrics_precision_and_recall)
rf_scored %>%
bind_rows(xgboost_scored) %>%
group_by(model, scored_on) %>%
metrics(fraudulent, .pred_1, estimate = .pred_class ) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
mutate_if(is.numeric, round, 3) %>%
left_join(precision_and_recall) %>%
select(model, scored_on, accuracy, kap, mn_log_loss, roc_auc, precision, recall)
## # A tibble: 4 x 8
## model scored_on accuracy kap mn_log_loss roc_auc precision recall
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 random forest test data 0.973 0.607 0.101 0.96 0.95 0.46
## 2 random forest train data 0.99 0.883 0.037 1 1 0.8
## 3 xgboost test data 0.965 0.547 0.39 0.896 0.71 0.46
## 4 xgboost train data 0.983 0.794 0.38 0.937 0.93 0.8