options (scipen = 999)
library(tidyverse)
library(tidymodels)
library(tidytext)
library(janitor)
library(skimr)
library(vip)
library(parallel)  # - new 
library(doParallel) # - new 
library(rpart.plot)
library(textrecipes)
library(stringi)
library(xgboost)
job_holdout <- read.csv("job_holdout.csv")
job_training <- read.csv("job_training.csv")

job_training <- job_training %>% janitor::clean_names()
job_training %>% skimr::skim_without_charts()
Data summary
Name Piped data
Number of rows 12516
Number of columns 18
_______________________
Column type frequency:
character 13
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1.00 3 142 0 8069 0
location 234 0.98 2 161 0 2485 0
department 8070 0.36 1 34 0 732 0
salary_range 10537 0.16 3 16 0 700 0
company_profile 2328 0.81 14 5578 0 1514 0
description 0 1.00 3 22693 0 10564 0
requirements 1892 0.85 1 6356 0 8568 0
benefits 5057 0.60 2 4489 0 4643 0
employment_type 2403 0.81 5 9 0 5 0
required_experience 4955 0.60 8 16 0 7 0
required_education 5665 0.55 9 33 0 13 0
industry 3424 0.73 5 36 0 129 0
job_function 4487 0.64 5 22 0 37 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
job_id 0 1 8940.07 5154.13 1 4483.5 8949.5 13384.25 17880
telecommuting 0 1 0.04 0.21 0 0.0 0.0 0.00 1
has_company_logo 0 1 0.79 0.40 0 1.0 1.0 1.00 1
has_questions 0 1 0.49 0.50 0 0.0 0.0 1.00 1
fraudulent 0 1 0.05 0.22 0 0.0 0.0 0.00 1

Clean data

Remove unimportant data

job_training <- job_training %>% select(-location)
job_holdout <- job_holdout %>% select(-location)

Replace variables with their binary variables

job_training <- job_training %>%
  mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
  select(-salary_range)

job_holdout <- job_holdout %>%
  mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
  select(-salary_range)


job_training <- job_training %>%
  mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
  select(-department)

job_holdout <- job_holdout %>%
  mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
  select(-department)

Work on the "Not applicables’

#---
job_training$employment_type[is.na(job_training$employment_type)] <- "Other"
job_holdout$employment_type[is.na(job_holdout$employment_type)] <- "Other"

#---
job_training$required_experience[is.na(job_training$required_experience)] <- "Not Applicable"
job_holdout$required_experience[is.na(job_holdout$required_experience)] <- "Not Applicable"

#---
job_training$required_education[is.na(job_training$required_education)] <- "Unspecified"
job_holdout$required_education[is.na(job_holdout$required_education)] <- "Unspecified"

Target encoding

####---
industry_fraud <- job_training %>%
  group_by(fraudulent, industry) %>%
  select(industry) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from= n, values_fill = 0.0) %>%
  mutate(industry_pct_fraud = `1`/(`1`+`0`)) %>%
  select(industry, industry_pct_fraud)

job_training <- job_training %>%
  left_join(industry_fraud) %>%
  select(-industry)

job_holdout <- job_holdout %>%
  left_join(industry_fraud) %>%
  select(-industry)

remove(industry_fraud)


###---
job_function_fraud <- job_training %>%
  group_by(fraudulent, job_function) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n, values_fill = 0.0) %>%
  mutate(jfunction_pct_fraud = `1`/(`0`+`1`)) %>%
  select(job_function, jfunction_pct_fraud)

job_training <- job_training %>%
  left_join(job_function_fraud) %>%
  select(-job_function)

job_holdout <- job_holdout %>%
  left_join(job_function_fraud) %>%
  select(-job_function)

remove(job_function_fraud)

Add number of words used to describe the company and the job position

job_training <- job_training %>%
  mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words = 
      ifelse(is.na(description), 0, str_count(description, boundary("word"))),
    requirement_words =
      ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
    benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))

  
job_holdout <- job_holdout %>%
  mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words = 
      ifelse(is.na(description), 0, str_count(description, boundary("word"))),
    requirement_words =
      ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
    benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))

Clean text data

job_training <- job_training %>%
  mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))

job_holdout <- job_holdout %>%
  mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))

to factor

training_df <- job_training %>%
  mutate_if(is.character, as_factor) %>%
  mutate(fraudulent = as.factor(fraudulent)) %>%
  mutate(telecommuting = as.factor(telecommuting)) %>%
  mutate(has_company_logo = as.factor(has_company_logo)) %>%
  mutate(has_questions = as.factor(has_questions)) %>%
  mutate(has_salary_range = as.factor(has_salary_range)) %>%
  mutate(has_department = as.factor(has_department))

holdout_df <- job_holdout %>%
  mutate_if(is.character, as_factor) %>%
  mutate(telecommuting = as.factor(telecommuting)) %>%
  mutate(has_company_logo = as.factor(has_company_logo)) %>%
  mutate(has_questions = as.factor(has_questions)) %>%
  mutate(has_salary_range = as.factor(has_salary_range)) %>%
  mutate(has_department = as.factor(has_department))
skimr::skim_without_charts(data=training_df)
Data summary
Name training_df
Number of rows 12516
Number of columns 21
_______________________
Column type frequency:
factor 14
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
title 0 1.00 FALSE 8040 Eng: 293, Cus: 150, Gra: 103, Sof: 73
company_profile 2328 0.81 FALSE 1514 We : 528, We : 485, Nov: 416, Est: 321
description 0 1.00 FALSE 10563 Pla: 275, Pla: 49, Pla: 32, Pla: 24
requirements 1892 0.85 FALSE 8567 Uni: 296, Uni: 117, 16 : 77, Min: 60
benefits 5057 0.60 FALSE 4639 See: 528, Car: 104, CSD: 50, Fut: 44
telecommuting 0 1.00 FALSE 2 0: 11962, 1: 554
has_company_logo 0 1.00 FALSE 2 1: 9939, 0: 2577
has_questions 0 1.00 FALSE 2 0: 6348, 1: 6168
employment_type 0 1.00 FALSE 5 Ful: 8127, Oth: 2572, Con: 1086, Par: 564
required_experience 0 1.00 FALSE 7 Not: 5738, Mid: 2651, Ent: 1923, Ass: 1568
required_education 0 1.00 FALSE 13 Uns: 6641, Bac: 3600, Hig: 1471, Mas: 285
fraudulent 0 1.00 FALSE 2 0: 11899, 1: 617
has_salary_range 0 1.00 FALSE 2 0: 10537, 1: 1979
has_department 0 1.00 FALSE 2 0: 8070, 1: 4446

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
job_id 0 1 8940.07 5154.13 1 4483.50 8949.50 13384.25 17880.00
industry_pct_fraud 0 1 0.05 0.07 0 0.00 0.05 0.06 1.00
jfunction_pct_fraud 0 1 0.05 0.04 0 0.02 0.05 0.05 0.18
company_profile_words 0 1 96.09 92.80 0 20.00 86.00 138.00 891.00
description_words 0 1 186.19 226.97 1 90.00 150.00 237.00 10607.00
requirement_words 0 1 82.94 88.38 0 19.00 65.00 115.00 1001.00
benefit_words 0 1 31.92 54.85 0 0.00 6.00 44.00 730.00

Split Data

set.seed(42)
df_split <- initial_split(data = training_df, prop = 0.7, strata = fraudulent)

train <- training(df_split)
test <- testing(df_split)

set.seed(42)
train_fold <- vfold_cv(data = train, v = 5)

Data Exploration

Target Variable - bar graph

job_training %>%
  ggplot(aes(fraudulent)) +
  geom_bar(stat = "count") +
  labs(title = "Distribution of Fraudulent Jobs")

Target variable - table

job_training %>%
  group_by(fraudulent) %>%
  summarize(n=n()) %>%
  mutate(pct = round(n/sum(n), 2))
## # A tibble: 2 x 3
##   fraudulent     n   pct
##        <int> <int> <dbl>
## 1          0 11899  0.95
## 2          1   617  0.05

Correlation

job_training %>%
  select(where(is.numeric), -job_id) %>%
  lares::corr_var(fraudulent)

Numeric Data

plot_numeric <- function(col){
  training_df %>%
    ggplot(aes(x=fraudulent, y=!!as.name(col))) +
    geom_boxplot()
}

for (column in names(training_df %>% select(where(is.numeric), -job_id))){
  print(plot_numeric(column))
}

Numeric Data - Percentage fraud by category

Industry percentage fraud and job function percentage fraud

training_df %>%
  select(fraudulent, where(is.numeric), -job_id) %>%
  group_by(fraudulent) %>%
  summarize(industry_pct = round(mean(industry_pct_fraud), 2),
            job_function_pct = round(mean(jfunction_pct_fraud), 2)) %>%
  ungroup() %>%
  mutate_if(is.numeric, round, 2)
## # A tibble: 2 x 3
##   fraudulent industry_pct job_function_pct
##   <fct>             <dbl>            <dbl>
## 1 0                  0.04             0.05
## 2 1                  0.14             0.08

Number of words used to describe company profile, job description job requirements, and benefits

training_df %>%
  select(fraudulent, where(is.numeric), -job_id) %>%
  group_by(fraudulent) %>%
  summarize(company_profile = round(mean(company_profile_words), 0),
            description = round(mean(description_words), 0),
            requirement = round(mean(requirement_words), 0),
            benefit = round(mean(benefit_words), 0)) %>%
  ungroup() %>%
  mutate_if(is.numeric, round, 2)
## # A tibble: 2 x 5
##   fraudulent company_profile description requirement benefit
##   <fct>                <dbl>       <dbl>       <dbl>   <dbl>
## 1 0                       99         187          84      32
## 2 1                       33         170          64      32

factor Data

plot_factor <- function(col){
  training_df %>%
    ggplot(., aes(!!as.name(col))) +
    geom_bar(aes(fill = fraudulent), position = "fill") +
    coord_flip()
}

for(column in names(training_df %>%
                    select(telecommuting, has_company_logo, has_questions,
                           employment_type, required_experience, required_education,
                           has_salary_range,has_department)))
{
  print(plot_factor(column))
}

Factor data - percentage fraud by category

Telecommuting

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, telecommuting) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(telecommuting, legit, fraud, pct) %>%
  arrange(desc(pct))
## # A tibble: 2 x 4
##   telecommuting legit fraud   pct
##   <fct>         <int> <int> <dbl>
## 1 1               508    46  0.08
## 2 0             11391   571  0.05

has_company_logo

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_company_logo) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_company_logo, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 2 x 4
##   has_company_logo legit fraud   pct
##   <fct>            <int> <int> <dbl>
## 1 0                 2169   408  0.16
## 2 1                 9730   209  0.02

has_questions

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_questions) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_questions, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 2 x 4
##   has_questions legit fraud   pct
##   <fct>         <int> <int> <dbl>
## 1 0              5915   433  0.07
## 2 1              5984   184  0.03

employment_type

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, employment_type) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(employment_type, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 5 x 4
##   employment_type legit fraud   pct
##   <fct>           <int> <int> <dbl>
## 1 Part-time         512    52  0.09
## 2 Other            2391   181  0.07
## 3 Full-time        7774   353  0.04
## 4 Contract         1056    30  0.03
## 5 Temporary         166     1  0.01

required_experience

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, required_experience) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(required_experience, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 7 x 4
##   required_experience legit fraud   pct
##   <fct>               <int> <int> <dbl>
## 1 Executive              88     8  0.08
## 2 Entry level          1798   125  0.07
## 3 Not Applicable       5385   353  0.06
## 4 Director              262    15  0.05
## 5 Internship            256     7  0.03
## 6 Mid-Senior level     2566    85  0.03
## 7 Associate            1544    24  0.02

required_education

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, required_education) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(required_education, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 13 x 4
##    required_education                legit fraud   pct
##    <fct>                             <int> <int> <dbl>
##  1 Some High School Coursework           5    15  0.75
##  2 Certification                       105    12  0.1 
##  3 Master's Degree                     258    27  0.09
##  4 High School or equivalent          1354   117  0.08
##  5 Doctorate                            17     1  0.06
##  6 Unspecified                        6280   361  0.05
##  7 Professional                         59     3  0.05
##  8 Some College Coursework Completed    62     2  0.03
##  9 Associate Degree                    186     5  0.03
## 10 Bachelor's Degree                  3526    74  0.02
## 11 Vocational                           36    NA NA   
## 12 Vocational - Degree                   3    NA NA   
## 13 Vocational - HS Diploma               8    NA NA

has_salary_range

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_salary_range) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_salary_range, legit, fraud, pct)%>%
  arrange(desc(pct))
## # A tibble: 2 x 4
##   has_salary_range legit fraud   pct
##   <fct>            <int> <int> <dbl>
## 1 1                 1822   157  0.08
## 2 0                10077   460  0.04

Model Creation

Random Forest

training_recipe <- training_df %>%
  recipe(fraudulent ~.) %>%
  update_role(job_id, new_role = "IGNORE") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_tokenize(title, company_profile, description, requirements, benefits) %>%
  step_stopwords(title, company_profile, description, requirements, benefits) %>%
  step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
  step_tf(title, company_profile, description, requirements, benefits) %>%
  step_other(all_nominal_predictors(), threshold = 0.01)


rf_model <- rand_forest(min_n = tune(), trees = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

rf_workflow <- workflow() %>%
  add_recipe(training_recipe) %>%
  add_model(rf_model)

all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)
rf_grid <- grid_regular(
  min_n(),
  trees(c(5,20)),
  levels = 5
)

rf_tuning_results <- rf_workflow %>%
  tune_grid(
    resamples = train_fold,
    grid = rf_grid
    )

rf_model <- rf_workflow %>%
  finalize_workflow(select_best(rf_tuning_results))

rf_fit <- fit(rf_model, train)

Tuning Metrics

Tuning metrics - Table

rf_tuning_results_table <- rf_tuning_results %>%
  collect_metrics()%>%
  mutate_if(is.numeric, round, 3)%>%
  select(.metric, trees, min_n, mean, std_err) %>%
  arrange(desc(mean))

rf_tuning_results_table %>%
  filter(.metric == "roc_auc")
## # A tibble: 25 x 5
##    .metric trees min_n  mean std_err
##    <chr>   <dbl> <dbl> <dbl>   <dbl>
##  1 roc_auc    20    11 0.969   0.002
##  2 roc_auc    20    30 0.965   0.004
##  3 roc_auc    20    40 0.964   0.002
##  4 roc_auc    16    11 0.962   0.004
##  5 roc_auc    16    30 0.962   0.002
##  6 roc_auc    12    21 0.961   0.004
##  7 roc_auc    16    21 0.961   0.004
##  8 roc_auc    16    40 0.961   0.003
##  9 roc_auc    20    21 0.961   0.003
## 10 roc_auc    12    40 0.959   0.002
## # ... with 15 more rows

Tuning metrics - Effect of “trees”

rf_tuning_results_table %>%
  ggplot(aes(trees, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales="free", nrow = 2)

Tuning metrics - Effect of “min_n”

rf_tuning_results_table %>%
  ggplot(aes(min_n, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales="free", nrow = 2)

Fit model metrics

Fit model metrics - Scores

options(yardstick.event_first = FALSE)
rf_train_scored <- predict(rf_fit, train, type = "prob") %>%
  bind_cols(predict(rf_fit, train)) %>%
  bind_cols(train) %>%
  mutate(model = "random forest", scored_on = "train data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

rf_test_scored <- predict(rf_fit, test, type = "prob") %>%
  bind_cols(predict(rf_fit, test)) %>%
  bind_cols(test) %>%
  mutate(model = "random forest", scored_on = "test data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

rf_scored <- rf_train_scored %>%
  bind_rows(rf_test_scored)

Fit model metrics - important variables

rf_fit %>%
  pull_workflow_fit() %>%
  vip(10)

Fit model metrics - Classification Matrix

rf_train_scored %>%
  conf_mat(fraudulent, .pred_class) %>%
  autoplot(type="heatmap") +
  labs(title = "Classification Matrix - Train Data")

rf_test_scored %>%
  conf_mat(fraudulent, .pred_class) %>%
  autoplot(type="heatmap") +
  labs(title = "Classification Matrix - Test Data")

Fit model metrics - ROC_AUC Curve

rf_scored %>%
  group_by(scored_on) %>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color="blue", size=0.5)

Metrics - accuracy, kap, mn_log_loss and roc_auc

rf_metrics_roc_logloss_etc <- rf_test_scored %>%
  metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
  mutate(model = "random forest", scored_on = "test data") %>%
  bind_rows(
    rf_train_scored %>%
      metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
      mutate(model = "random forest", scored_on = "train data")
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from =  .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 2)

rf_metrics_roc_logloss_etc
## # A tibble: 2 x 7
## # Groups:   model, scored_on [2]
##   model         .estimator scored_on  accuracy   kap mn_log_loss roc_auc
##   <chr>         <chr>      <chr>         <dbl> <dbl>       <dbl>   <dbl>
## 1 random forest binary     test data      0.97  0.61        0.1     0.96
## 2 random forest binary     train data     0.99  0.88        0.04    1

Metrics - precision and recall

rf_metrics_precision_and_recall <- rf_train_scored %>%
  precision(truth = fraudulent, estimate = .pred_class) %>%
  bind_rows(rf_train_scored %>%
              recall(truth = fraudulent, estimate = .pred_class)) %>%
  mutate(model = "random forest", scored_on = "train data") %>%
  bind_rows(
    rf_test_scored %>%
      precision(truth = fraudulent, estimate = .pred_class) %>%
      bind_rows(rf_test_scored %>%
                  recall(truth = fraudulent, estimate = .pred_class)) %>%
      mutate(model = "random forest", scored_on = "test data")  
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, precision, recall) %>%
  mutate_if(is.numeric, round, 2)

rf_metrics_precision_and_recall
## # A tibble: 2 x 5
## # Groups:   model, scored_on [2]
##   model         .estimator scored_on  precision recall
##   <chr>         <chr>      <chr>          <dbl>  <dbl>
## 1 random forest binary     train data      1      0.8 
## 2 random forest binary     test data       0.95   0.46

Fit model metrics - Threshold

rf_test_scored %>%
  roc_curve(fraudulent, .pred_1) %>%
  mutate(fpr = 1-round(specificity, 2),
         tpr = round(sensitivity, 3),
         threshold = 1 - round(.threshold, 3)) %>%
  select(fpr, tpr, threshold) %>%
  group_by(fpr) %>%
  summarize(tpr = max(tpr),
          threshold = max(threshold)) %>%
  ungroup() %>%
  mutate(precision = tpr/(fpr + tpr)) %>%
  mutate_if(is.numeric, round, 2) %>%
  arrange(fpr) %>%
  filter(fpr <= 0.1)
## # A tibble: 11 x 4
##      fpr   tpr threshold precision
##    <dbl> <dbl>     <dbl>     <dbl>
##  1  0     0.54      0.61      1   
##  2  0.01  0.63      0.73      0.98
##  3  0.02  0.68      0.79      0.97
##  4  0.03  0.72      0.83      0.96
##  5  0.04  0.74      0.85      0.95
##  6  0.05  0.77      0.87      0.94
##  7  0.06  0.8       0.88      0.93
##  8  0.07  0.84      0.9       0.92
##  9  0.08  0.86      0.91      0.92
## 10  0.09  0.88      0.92      0.91
## 11  0.1   0.88      0.92      0.9

XGBoost

xgboost_recipe <- training_df %>%
  recipe(fraudulent ~.) %>%
  update_role(job_id, new_role = "IGNORE") %>%
  step_nzv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_tokenize(title, company_profile, description, requirements, benefits) %>%
  step_stopwords(title, company_profile, description, requirements, benefits) %>%
  step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
  step_tf(title, company_profile, description, requirements, benefits) %>%
  step_other(all_nominal_predictors(), threshold = 0.05) %>%
  step_string2factor(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors())


xgboost_model <- boost_tree(trees = 5,
                            tree_depth = tune(),
                            learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgboost_wf <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_model)

xgboost_grid <- grid_regular(
  tree_depth(),
  learn_rate(),
  levels=5)

all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)

xgboost_tuning_results <- xgboost_wf %>%
  tune_grid(
    resamples = train_fold,
    grid = xgboost_grid
  )

xgboost_model <- xgboost_wf %>%
  finalize_workflow(select_best(xgboost_tuning_results))

xgboost_fit <- fit(xgboost_model, train)
## [20:40:58] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Tuning metrics Scores - Table

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
## # A tibble: 25 x 5
##    .metric tree_depth learn_rate  mean std_err
##    <chr>        <dbl>      <dbl> <dbl>   <dbl>
##  1 roc_auc         11      0.1   0.911   0.006
##  2 roc_auc         15      0.1   0.911   0.007
##  3 roc_auc          8      0.1   0.91    0.006
##  4 roc_auc          4      0.1   0.89    0.004
##  5 roc_auc         15      0     0.884   0.009
##  6 roc_auc         11      0.001 0.884   0.011
##  7 roc_auc         15      0.001 0.884   0.01 
##  8 roc_auc          8      0     0.883   0.01 
##  9 roc_auc         11      0     0.883   0.01 
## 10 roc_auc          8      0.001 0.883   0.011
## # ... with 15 more rows

Tuning metrics Scores - Effects of “tree_depth”

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  ggplot(aes(tree_depth, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales = "free", nrow = 2)

Tuning metrics Scores - Effect of “learn_rate”

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  ggplot(aes(learn_rate, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales = "free", nrow = 2)

Fit model metrics

options(yardstick.event_first = FALSE)
xgboost_train_scored <- predict(xgboost_fit, train, type = "prob") %>%
  bind_cols(predict(xgboost_fit, train)) %>%
  bind_cols(train) %>%
  mutate(model = "xgboost", scored_on = "train data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

xgboost_test_scored <- predict(xgboost_fit, test, type = "prob") %>%
  bind_cols(predict(xgboost_fit, test)) %>%
  bind_cols(test) %>%
  mutate(model = "xgboost", scored_on = "test data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

xgboost_scored <- xgboost_train_scored %>%
  bind_rows(xgboost_test_scored)

Important variables

xgboost_fit %>%
  pull_workflow_fit() %>%
  vip(10)

Fit model metrics - ROC_AUC Curve

xgboost_scored %>%
  group_by(scored_on) %>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color="blue", size=0.5)

Metrics - accuracy, kap, mn_log_loss and roc_auc

xgboost_metrics_roc_logloss_etc <- xgboost_test_scored %>%
  metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
  mutate(model = "xgboost", scored_on = "test data") %>%
  bind_rows(
    xgboost_train_scored %>%
      metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
      mutate(model = "xgboost", scored_on = "train data")
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from =  .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 2)

xgboost_metrics_roc_logloss_etc
## # A tibble: 2 x 7
## # Groups:   model, scored_on [2]
##   model   .estimator scored_on  accuracy   kap mn_log_loss roc_auc
##   <chr>   <chr>      <chr>         <dbl> <dbl>       <dbl>   <dbl>
## 1 xgboost binary     test data      0.97  0.55        0.39    0.9 
## 2 xgboost binary     train data     0.98  0.79        0.38    0.94

Metrics - precision and recall

xgboost_metrics_precision_and_recall <- xgboost_train_scored %>%
  precision(truth = fraudulent, estimate = .pred_class) %>%
  bind_rows(rf_train_scored %>%
              recall(truth = fraudulent, estimate = .pred_class)) %>%
  mutate(model = "xgboost", scored_on = "train data") %>%
  bind_rows(
    xgboost_test_scored %>%
      precision(truth = fraudulent, estimate = .pred_class) %>%
      bind_rows(rf_test_scored %>%
                  recall(truth = fraudulent, estimate = .pred_class)) %>%
      mutate(model = "xgboost", scored_on = "test data")  
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, precision, recall) %>%
  mutate_if(is.numeric, round, 2)

xgboost_metrics_precision_and_recall
## # A tibble: 2 x 5
## # Groups:   model, scored_on [2]
##   model   .estimator scored_on  precision recall
##   <chr>   <chr>      <chr>          <dbl>  <dbl>
## 1 xgboost binary     train data      0.93   0.8 
## 2 xgboost binary     test data       0.71   0.46

Fit model metrics - threshold

rf_test_scored %>%
  roc_curve(fraudulent, .pred_1) %>%
  mutate(fpr = 1-round(specificity, 2),
         tpr = round(sensitivity, 3),
         threshold = 1 - round(.threshold, 3)) %>%
  select(fpr, tpr, threshold) %>%
  group_by(fpr) %>%
  summarize(tpr = max(tpr),
          threshold = max(threshold)) %>%
  ungroup() %>%
  mutate(precision = round(tpr/(fpr + tpr), 2)) %>%
  select(fpr, tpr, precision, threshold) %>%
  arrange(fpr) %>%
  filter(fpr <= 0.1)
## # A tibble: 11 x 4
##       fpr   tpr precision threshold
##     <dbl> <dbl>     <dbl>     <dbl>
##  1 0      0.536      1        0.612
##  2 0.0100 0.63       0.98     0.731
##  3 0.0200 0.68       0.97     0.789
##  4 0.0300 0.718      0.96     0.827
##  5 0.0400 0.74       0.95     0.847
##  6 0.0500 0.773      0.94     0.866
##  7 0.0600 0.796      0.93     0.883
##  8 0.0700 0.84       0.92     0.898
##  9 0.0800 0.862      0.92     0.908
## 10 0.09   0.878      0.91     0.916
## 11 0.1    0.884      0.9      0.92

Model Comparison

ROC_AUC curve

rf_scored %>%
  bind_rows(xgboost_scored) %>%
  filter(scored_on == "test data") %>%
  group_by(model)%>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color = "blue", size = 0.5)

Metrics

precision_and_recall <- rf_metrics_precision_and_recall %>%
  bind_rows(xgboost_metrics_precision_and_recall)

rf_scored %>%
  bind_rows(xgboost_scored) %>%
  group_by(model, scored_on) %>%
  metrics(fraudulent, .pred_1, estimate = .pred_class ) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 3) %>%
  left_join(precision_and_recall) %>%
  select(model, scored_on, accuracy, kap, mn_log_loss, roc_auc, precision, recall)
## # A tibble: 4 x 8
##   model         scored_on  accuracy   kap mn_log_loss roc_auc precision recall
##   <chr>         <chr>         <dbl> <dbl>       <dbl>   <dbl>     <dbl>  <dbl>
## 1 random forest test data     0.973 0.607       0.101   0.96       0.95   0.46
## 2 random forest train data    0.99  0.883       0.037   1          1      0.8 
## 3 xgboost       test data     0.965 0.547       0.39    0.896      0.71   0.46
## 4 xgboost       train data    0.983 0.794       0.38    0.937      0.93   0.8