Fake Job Posting

options (scipen = 999)
library(tidyverse)
library(tidymodels)
library(tidytext)
library(janitor)
library(skimr)
library(vip)
library(parallel)  # - new 
library(doParallel) # - new 
library(rpart.plot)
library(textrecipes)
library(stringi)
library(xgboost)

job_holdout <- read.csv("job_holdout.csv")
job_training <- read.csv("job_training.csv")

job_training <- job_training %>% janitor::clean_names()

job_training %>% skimr::skim_without_charts()

Data summary
Name	Piped data
Number of rows	12516
Number of columns	18
_______________________
Column type frequency:
character	13
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
title	0	1.00	3	142	8069
location	234	0.98	2	161	2485
department	8070	0.36	1	34	732
salary_range	10537	0.16	3	16	700
company_profile	2328	0.81	14	5578	1514
description	0	1.00	3	22693	10564
requirements	1892	0.85	1	6356	8568
benefits	5057	0.60	2	4489	4643
employment_type	2403	0.81	5	9	5
required_experience	4955	0.60	8	16	7
required_education	5665	0.55	9	33	13
industry	3424	0.73	5	36	129
job_function	4487	0.64	5	22	37

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
job_id	1	8940.07	5154.13	1	4483.5	8949.5	13384.25	17880
telecommuting	1	0.04	0.21	0	0.0	0.0	0.00	1
has_company_logo	1	0.79	0.40	0	1.0	1.0	1.00	1
has_questions	1	0.49	0.50	0	0.0	0.0	1.00	1
fraudulent	1	0.05	0.22	0	0.0	0.0	0.00	1

Clean data

Remove unimportant data

job_training <- job_training %>% select(-location)
job_holdout <- job_holdout %>% select(-location)

Replace variables with their binary variables

job_training <- job_training %>%
  mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
  select(-salary_range)

job_holdout <- job_holdout %>%
  mutate(has_salary_range = ifelse(is.na(salary_range), 0, 1))%>%
  select(-salary_range)


job_training <- job_training %>%
  mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
  select(-department)

job_holdout <- job_holdout %>%
  mutate(has_department = ifelse(is.na(department), 0, 1)) %>%
  select(-department)

Work on the "Not applicables’

#---
job_training$employment_type[is.na(job_training$employment_type)] <- "Other"
job_holdout$employment_type[is.na(job_holdout$employment_type)] <- "Other"

#---
job_training$required_experience[is.na(job_training$required_experience)] <- "Not Applicable"
job_holdout$required_experience[is.na(job_holdout$required_experience)] <- "Not Applicable"

#---
job_training$required_education[is.na(job_training$required_education)] <- "Unspecified"
job_holdout$required_education[is.na(job_holdout$required_education)] <- "Unspecified"

Target encoding

####---
industry_fraud <- job_training %>%
  group_by(fraudulent, industry) %>%
  select(industry) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from= n, values_fill = 0.0) %>%
  mutate(industry_pct_fraud = `1`/(`1`+`0`)) %>%
  select(industry, industry_pct_fraud)

job_training <- job_training %>%
  left_join(industry_fraud) %>%
  select(-industry)

job_holdout <- job_holdout %>%
  left_join(industry_fraud) %>%
  select(-industry)

remove(industry_fraud)


###---
job_function_fraud <- job_training %>%
  group_by(fraudulent, job_function) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n, values_fill = 0.0) %>%
  mutate(jfunction_pct_fraud = `1`/(`0`+`1`)) %>%
  select(job_function, jfunction_pct_fraud)

job_training <- job_training %>%
  left_join(job_function_fraud) %>%
  select(-job_function)

job_holdout <- job_holdout %>%
  left_join(job_function_fraud) %>%
  select(-job_function)

remove(job_function_fraud)

Add number of words used to describe the company and the job position

job_training <- job_training %>%
  mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words = 
      ifelse(is.na(description), 0, str_count(description, boundary("word"))),
    requirement_words =
      ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
    benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))

  
job_holdout <- job_holdout %>%
  mutate(company_profile_words = ifelse(is.na(company_profile), 0, str_count(company_profile, boundary("word"))),description_words = 
      ifelse(is.na(description), 0, str_count(description, boundary("word"))),
    requirement_words =
      ifelse(is.na(requirements), 0, str_count(requirements, boundary("word"))),
    benefit_words = ifelse(is.na(benefits), 0, str_count(benefits, boundary("word"))))

Clean text data

job_training <- job_training %>%
  mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))

job_holdout <- job_holdout %>%
  mutate(title = stri_replace_all_regex(title, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(company_profile = stri_replace_all_regex(company_profile, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(description = stri_replace_all_regex(description, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(requirements = stri_replace_all_regex(requirements, '[^a-zA-Z0-9*$]', " ")) %>%
  mutate(benefits = stri_replace_all_regex(benefits, '[^a-zA-Z0-9*$]', " "))

to factor

training_df <- job_training %>%
  mutate_if(is.character, as_factor) %>%
  mutate(fraudulent = as.factor(fraudulent)) %>%
  mutate(telecommuting = as.factor(telecommuting)) %>%
  mutate(has_company_logo = as.factor(has_company_logo)) %>%
  mutate(has_questions = as.factor(has_questions)) %>%
  mutate(has_salary_range = as.factor(has_salary_range)) %>%
  mutate(has_department = as.factor(has_department))

holdout_df <- job_holdout %>%
  mutate_if(is.character, as_factor) %>%
  mutate(telecommuting = as.factor(telecommuting)) %>%
  mutate(has_company_logo = as.factor(has_company_logo)) %>%
  mutate(has_questions = as.factor(has_questions)) %>%
  mutate(has_salary_range = as.factor(has_salary_range)) %>%
  mutate(has_department = as.factor(has_department))

skimr::skim_without_charts(data=training_df)

Data summary
Name	training_df
Number of rows	12516
Number of columns	21
_______________________
Column type frequency:
factor	14
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
title	0	1.00	FALSE	8040	Eng: 293, Cus: 150, Gra: 103, Sof: 73
company_profile	2328	0.81	FALSE	1514	We : 528, We : 485, Nov: 416, Est: 321
description	0	1.00	FALSE	10563	Pla: 275, Pla: 49, Pla: 32, Pla: 24
requirements	1892	0.85	FALSE	8567	Uni: 296, Uni: 117, 16 : 77, Min: 60
benefits	5057	0.60	FALSE	4639	See: 528, Car: 104, CSD: 50, Fut: 44
telecommuting	0	1.00	FALSE	2	0: 11962, 1: 554
has_company_logo	0	1.00	FALSE	2	1: 9939, 0: 2577
has_questions	0	1.00	FALSE	2	0: 6348, 1: 6168
employment_type	0	1.00	FALSE	5	Ful: 8127, Oth: 2572, Con: 1086, Par: 564
required_experience	0	1.00	FALSE	7	Not: 5738, Mid: 2651, Ent: 1923, Ass: 1568
required_education	0	1.00	FALSE	13	Uns: 6641, Bac: 3600, Hig: 1471, Mas: 285
fraudulent	0	1.00	FALSE	2	0: 11899, 1: 617
has_salary_range	0	1.00	FALSE	2	0: 10537, 1: 1979
has_department	0	1.00	FALSE	2	0: 8070, 1: 4446

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
job_id	1	8940.07	5154.13	1	4483.50	8949.50	13384.25	17880.00
industry_pct_fraud	1	0.05	0.07	0	0.00	0.05	0.06	1.00
jfunction_pct_fraud	1	0.05	0.04	0	0.02	0.05	0.05	0.18
company_profile_words	1	96.09	92.80	0	20.00	86.00	138.00	891.00
description_words	1	186.19	226.97	1	90.00	150.00	237.00	10607.00
requirement_words	1	82.94	88.38	0	19.00	65.00	115.00	1001.00
benefit_words	1	31.92	54.85	0	0.00	6.00	44.00	730.00

Split Data

set.seed(42)
df_split <- initial_split(data = training_df, prop = 0.7, strata = fraudulent)

train <- training(df_split)
test <- testing(df_split)

set.seed(42)
train_fold <- vfold_cv(data = train, v = 5)

Data Exploration

Target Variable - bar graph

job_training %>%
  ggplot(aes(fraudulent)) +
  geom_bar(stat = "count") +
  labs(title = "Distribution of Fraudulent Jobs")

Target variable - table

job_training %>%
  group_by(fraudulent) %>%
  summarize(n=n()) %>%
  mutate(pct = round(n/sum(n), 2))

## # A tibble: 2 x 3
##   fraudulent     n   pct
##        <int> <int> <dbl>
## 1          0 11899  0.95
## 2          1   617  0.05

Correlation

job_training %>%
  select(where(is.numeric), -job_id) %>%
  lares::corr_var(fraudulent)

Numeric Data

plot_numeric <- function(col){
  training_df %>%
    ggplot(aes(x=fraudulent, y=!!as.name(col))) +
    geom_boxplot()
}

for (column in names(training_df %>% select(where(is.numeric), -job_id))){
  print(plot_numeric(column))
}

Numeric Data - Percentage fraud by category

Industry percentage fraud and job function percentage fraud

training_df %>%
  select(fraudulent, where(is.numeric), -job_id) %>%
  group_by(fraudulent) %>%
  summarize(industry_pct = round(mean(industry_pct_fraud), 2),
            job_function_pct = round(mean(jfunction_pct_fraud), 2)) %>%
  ungroup() %>%
  mutate_if(is.numeric, round, 2)

## # A tibble: 2 x 3
##   fraudulent industry_pct job_function_pct
##   <fct>             <dbl>            <dbl>
## 1 0                  0.04             0.05
## 2 1                  0.14             0.08

Number of words used to describe company profile, job description job requirements, and benefits

training_df %>%
  select(fraudulent, where(is.numeric), -job_id) %>%
  group_by(fraudulent) %>%
  summarize(company_profile = round(mean(company_profile_words), 0),
            description = round(mean(description_words), 0),
            requirement = round(mean(requirement_words), 0),
            benefit = round(mean(benefit_words), 0)) %>%
  ungroup() %>%
  mutate_if(is.numeric, round, 2)

## # A tibble: 2 x 5
##   fraudulent company_profile description requirement benefit
##   <fct>                <dbl>       <dbl>       <dbl>   <dbl>
## 1 0                       99         187          84      32
## 2 1                       33         170          64      32

factor Data

plot_factor <- function(col){
  training_df %>%
    ggplot(., aes(!!as.name(col))) +
    geom_bar(aes(fill = fraudulent), position = "fill") +
    coord_flip()
}

for(column in names(training_df %>%
                    select(telecommuting, has_company_logo, has_questions,
                           employment_type, required_experience, required_education,
                           has_salary_range,has_department)))
{
  print(plot_factor(column))
}

Factor data - percentage fraud by category

Telecommuting

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, telecommuting) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(telecommuting, legit, fraud, pct) %>%
  arrange(desc(pct))

## # A tibble: 2 x 4
##   telecommuting legit fraud   pct
##   <fct>         <int> <int> <dbl>
## 1 1               508    46  0.08
## 2 0             11391   571  0.05

has_company_logo

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_company_logo) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_company_logo, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 2 x 4
##   has_company_logo legit fraud   pct
##   <fct>            <int> <int> <dbl>
## 1 0                 2169   408  0.16
## 2 1                 9730   209  0.02

has_questions

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_questions) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_questions, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 2 x 4
##   has_questions legit fraud   pct
##   <fct>         <int> <int> <dbl>
## 1 0              5915   433  0.07
## 2 1              5984   184  0.03

employment_type

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, employment_type) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(employment_type, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 5 x 4
##   employment_type legit fraud   pct
##   <fct>           <int> <int> <dbl>
## 1 Part-time         512    52  0.09
## 2 Other            2391   181  0.07
## 3 Full-time        7774   353  0.04
## 4 Contract         1056    30  0.03
## 5 Temporary         166     1  0.01

required_experience

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, required_experience) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(required_experience, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 7 x 4
##   required_experience legit fraud   pct
##   <fct>               <int> <int> <dbl>
## 1 Executive              88     8  0.08
## 2 Entry level          1798   125  0.07
## 3 Not Applicable       5385   353  0.06
## 4 Director              262    15  0.05
## 5 Internship            256     7  0.03
## 6 Mid-Senior level     2566    85  0.03
## 7 Associate            1544    24  0.02

required_education

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, required_education) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(required_education, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 13 x 4
##    required_education                legit fraud   pct
##    <fct>                             <int> <int> <dbl>
##  1 Some High School Coursework           5    15  0.75
##  2 Certification                       105    12  0.1 
##  3 Master's Degree                     258    27  0.09
##  4 High School or equivalent          1354   117  0.08
##  5 Doctorate                            17     1  0.06
##  6 Unspecified                        6280   361  0.05
##  7 Professional                         59     3  0.05
##  8 Some College Coursework Completed    62     2  0.03
##  9 Associate Degree                    186     5  0.03
## 10 Bachelor's Degree                  3526    74  0.02
## 11 Vocational                           36    NA NA   
## 12 Vocational - Degree                   3    NA NA   
## 13 Vocational - HS Diploma               8    NA NA

has_salary_range

training_df %>%
  select(where(is.factor)) %>%
  group_by(fraudulent, has_salary_range) %>%
  summarize(n=n()) %>%
  pivot_wider(names_from = fraudulent, values_from = n) %>%
  mutate(pct = round(`1`/(`0`+`1`), 2), legit = `0`, fraud = `1`) %>%
  select(has_salary_range, legit, fraud, pct)%>%
  arrange(desc(pct))

## # A tibble: 2 x 4
##   has_salary_range legit fraud   pct
##   <fct>            <int> <int> <dbl>
## 1 1                 1822   157  0.08
## 2 0                10077   460  0.04

Model Creation

Random Forest

training_recipe <- training_df %>%
  recipe(fraudulent ~.) %>%
  update_role(job_id, new_role = "IGNORE") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_tokenize(title, company_profile, description, requirements, benefits) %>%
  step_stopwords(title, company_profile, description, requirements, benefits) %>%
  step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
  step_tf(title, company_profile, description, requirements, benefits) %>%
  step_other(all_nominal_predictors(), threshold = 0.01)


rf_model <- rand_forest(min_n = tune(), trees = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

rf_workflow <- workflow() %>%
  add_recipe(training_recipe) %>%
  add_model(rf_model)

all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)
rf_grid <- grid_regular(
  min_n(),
  trees(c(5,20)),
  levels = 5
)

rf_tuning_results <- rf_workflow %>%
  tune_grid(
    resamples = train_fold,
    grid = rf_grid
    )

rf_model <- rf_workflow %>%
  finalize_workflow(select_best(rf_tuning_results))

rf_fit <- fit(rf_model, train)

Tuning Metrics

Tuning metrics - Table

rf_tuning_results_table <- rf_tuning_results %>%
  collect_metrics()%>%
  mutate_if(is.numeric, round, 3)%>%
  select(.metric, trees, min_n, mean, std_err) %>%
  arrange(desc(mean))

rf_tuning_results_table %>%
  filter(.metric == "roc_auc")

## # A tibble: 25 x 5
##    .metric trees min_n  mean std_err
##    <chr>   <dbl> <dbl> <dbl>   <dbl>
##  1 roc_auc    20    11 0.969   0.002
##  2 roc_auc    20    30 0.965   0.004
##  3 roc_auc    20    40 0.964   0.002
##  4 roc_auc    16    11 0.962   0.004
##  5 roc_auc    16    30 0.962   0.002
##  6 roc_auc    12    21 0.961   0.004
##  7 roc_auc    16    21 0.961   0.004
##  8 roc_auc    16    40 0.961   0.003
##  9 roc_auc    20    21 0.961   0.003
## 10 roc_auc    12    40 0.959   0.002
## # ... with 15 more rows

Tuning metrics - Effect of “trees”

rf_tuning_results_table %>%
  ggplot(aes(trees, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales="free", nrow = 2)

Tuning metrics - Effect of “min_n”

rf_tuning_results_table %>%
  ggplot(aes(min_n, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales="free", nrow = 2)

Fit model metrics

Fit model metrics - Scores

options(yardstick.event_first = FALSE)
rf_train_scored <- predict(rf_fit, train, type = "prob") %>%
  bind_cols(predict(rf_fit, train)) %>%
  bind_cols(train) %>%
  mutate(model = "random forest", scored_on = "train data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

rf_test_scored <- predict(rf_fit, test, type = "prob") %>%
  bind_cols(predict(rf_fit, test)) %>%
  bind_cols(test) %>%
  mutate(model = "random forest", scored_on = "test data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

rf_scored <- rf_train_scored %>%
  bind_rows(rf_test_scored)

Fit model metrics - important variables

rf_fit %>%
  pull_workflow_fit() %>%
  vip(10)

Fit model metrics - Classification Matrix

rf_train_scored %>%
  conf_mat(fraudulent, .pred_class) %>%
  autoplot(type="heatmap") +
  labs(title = "Classification Matrix - Train Data")

rf_test_scored %>%
  conf_mat(fraudulent, .pred_class) %>%
  autoplot(type="heatmap") +
  labs(title = "Classification Matrix - Test Data")

Fit model metrics - ROC_AUC Curve

rf_scored %>%
  group_by(scored_on) %>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color="blue", size=0.5)

Metrics - accuracy, kap, mn_log_loss and roc_auc

rf_metrics_roc_logloss_etc <- rf_test_scored %>%
  metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
  mutate(model = "random forest", scored_on = "test data") %>%
  bind_rows(
    rf_train_scored %>%
      metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
      mutate(model = "random forest", scored_on = "train data")
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from =  .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 2)

rf_metrics_roc_logloss_etc

## # A tibble: 2 x 7
## # Groups:   model, scored_on [2]
##   model         .estimator scored_on  accuracy   kap mn_log_loss roc_auc
##   <chr>         <chr>      <chr>         <dbl> <dbl>       <dbl>   <dbl>
## 1 random forest binary     test data      0.97  0.61        0.1     0.96
## 2 random forest binary     train data     0.99  0.88        0.04    1

Metrics - precision and recall

rf_metrics_precision_and_recall <- rf_train_scored %>%
  precision(truth = fraudulent, estimate = .pred_class) %>%
  bind_rows(rf_train_scored %>%
              recall(truth = fraudulent, estimate = .pred_class)) %>%
  mutate(model = "random forest", scored_on = "train data") %>%
  bind_rows(
    rf_test_scored %>%
      precision(truth = fraudulent, estimate = .pred_class) %>%
      bind_rows(rf_test_scored %>%
                  recall(truth = fraudulent, estimate = .pred_class)) %>%
      mutate(model = "random forest", scored_on = "test data")  
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, precision, recall) %>%
  mutate_if(is.numeric, round, 2)

rf_metrics_precision_and_recall

## # A tibble: 2 x 5
## # Groups:   model, scored_on [2]
##   model         .estimator scored_on  precision recall
##   <chr>         <chr>      <chr>          <dbl>  <dbl>
## 1 random forest binary     train data      1      0.8 
## 2 random forest binary     test data       0.95   0.46

Fit model metrics - Threshold

rf_test_scored %>%
  roc_curve(fraudulent, .pred_1) %>%
  mutate(fpr = 1-round(specificity, 2),
         tpr = round(sensitivity, 3),
         threshold = 1 - round(.threshold, 3)) %>%
  select(fpr, tpr, threshold) %>%
  group_by(fpr) %>%
  summarize(tpr = max(tpr),
          threshold = max(threshold)) %>%
  ungroup() %>%
  mutate(precision = tpr/(fpr + tpr)) %>%
  mutate_if(is.numeric, round, 2) %>%
  arrange(fpr) %>%
  filter(fpr <= 0.1)

## # A tibble: 11 x 4
##      fpr   tpr threshold precision
##    <dbl> <dbl>     <dbl>     <dbl>
##  1  0     0.54      0.61      1   
##  2  0.01  0.63      0.73      0.98
##  3  0.02  0.68      0.79      0.97
##  4  0.03  0.72      0.83      0.96
##  5  0.04  0.74      0.85      0.95
##  6  0.05  0.77      0.87      0.94
##  7  0.06  0.8       0.88      0.93
##  8  0.07  0.84      0.9       0.92
##  9  0.08  0.86      0.91      0.92
## 10  0.09  0.88      0.92      0.91
## 11  0.1   0.88      0.92      0.9

XGBoost

xgboost_recipe <- training_df %>%
  recipe(fraudulent ~.) %>%
  update_role(job_id, new_role = "IGNORE") %>%
  step_nzv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_tokenize(title, company_profile, description, requirements, benefits) %>%
  step_stopwords(title, company_profile, description, requirements, benefits) %>%
  step_tokenfilter(title, company_profile, description, requirements, benefits, max_tokens = 5) %>%
  step_tf(title, company_profile, description, requirements, benefits) %>%
  step_other(all_nominal_predictors(), threshold = 0.05) %>%
  step_string2factor(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors())


xgboost_model <- boost_tree(trees = 5,
                            tree_depth = tune(),
                            learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgboost_wf <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_model)

xgboost_grid <- grid_regular(
  tree_depth(),
  learn_rate(),
  levels=5)

all_cores <- detectCores(logical = FALSE)
cl <- makeCluster(all_cores)
doParallel::registerDoParallel(cl)

xgboost_tuning_results <- xgboost_wf %>%
  tune_grid(
    resamples = train_fold,
    grid = xgboost_grid
  )

xgboost_model <- xgboost_wf %>%
  finalize_workflow(select_best(xgboost_tuning_results))

xgboost_fit <- fit(xgboost_model, train)

## [20:40:58] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Tuning metrics Scores - Table

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))

## # A tibble: 25 x 5
##    .metric tree_depth learn_rate  mean std_err
##    <chr>        <dbl>      <dbl> <dbl>   <dbl>
##  1 roc_auc         11      0.1   0.911   0.006
##  2 roc_auc         15      0.1   0.911   0.007
##  3 roc_auc          8      0.1   0.91    0.006
##  4 roc_auc          4      0.1   0.89    0.004
##  5 roc_auc         15      0     0.884   0.009
##  6 roc_auc         11      0.001 0.884   0.011
##  7 roc_auc         15      0.001 0.884   0.01 
##  8 roc_auc          8      0     0.883   0.01 
##  9 roc_auc         11      0     0.883   0.01 
## 10 roc_auc          8      0.001 0.883   0.011
## # ... with 15 more rows

Tuning metrics Scores - Effects of “tree_depth”

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  ggplot(aes(tree_depth, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales = "free", nrow = 2)

Tuning metrics Scores - Effect of “learn_rate”

xgboost_tuning_results %>%
  collect_metrics() %>%
  mutate_if(is.numeric, round, 3) %>%
  select(.metric, tree_depth, learn_rate, mean, std_err) %>%
  ggplot(aes(learn_rate, mean)) +
  geom_line() +
  facet_wrap(~.metric, scales = "free", nrow = 2)

Fit model metrics

options(yardstick.event_first = FALSE)
xgboost_train_scored <- predict(xgboost_fit, train, type = "prob") %>%
  bind_cols(predict(xgboost_fit, train)) %>%
  bind_cols(train) %>%
  mutate(model = "xgboost", scored_on = "train data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

xgboost_test_scored <- predict(xgboost_fit, test, type = "prob") %>%
  bind_cols(predict(xgboost_fit, test)) %>%
  bind_cols(test) %>%
  mutate(model = "xgboost", scored_on = "test data") %>%
  select(job_id, model, scored_on, fraudulent, .pred_class, .pred_1)

xgboost_scored <- xgboost_train_scored %>%
  bind_rows(xgboost_test_scored)

Important variables

xgboost_fit %>%
  pull_workflow_fit() %>%
  vip(10)

Fit model metrics - ROC_AUC Curve

xgboost_scored %>%
  group_by(scored_on) %>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color="blue", size=0.5)

Metrics - accuracy, kap, mn_log_loss and roc_auc

xgboost_metrics_roc_logloss_etc <- xgboost_test_scored %>%
  metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
  mutate(model = "xgboost", scored_on = "test data") %>%
  bind_rows(
    xgboost_train_scored %>%
      metrics(fraudulent, .pred_1, estimate =.pred_class) %>%
      mutate(model = "xgboost", scored_on = "train data")
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from =  .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 2)

xgboost_metrics_roc_logloss_etc

## # A tibble: 2 x 7
## # Groups:   model, scored_on [2]
##   model   .estimator scored_on  accuracy   kap mn_log_loss roc_auc
##   <chr>   <chr>      <chr>         <dbl> <dbl>       <dbl>   <dbl>
## 1 xgboost binary     test data      0.97  0.55        0.39    0.9 
## 2 xgboost binary     train data     0.98  0.79        0.38    0.94

Metrics - precision and recall

xgboost_metrics_precision_and_recall <- xgboost_train_scored %>%
  precision(truth = fraudulent, estimate = .pred_class) %>%
  bind_rows(rf_train_scored %>%
              recall(truth = fraudulent, estimate = .pred_class)) %>%
  mutate(model = "xgboost", scored_on = "train data") %>%
  bind_rows(
    xgboost_test_scored %>%
      precision(truth = fraudulent, estimate = .pred_class) %>%
      bind_rows(rf_test_scored %>%
                  recall(truth = fraudulent, estimate = .pred_class)) %>%
      mutate(model = "xgboost", scored_on = "test data")  
  ) %>%
  group_by(model, scored_on) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, precision, recall) %>%
  mutate_if(is.numeric, round, 2)

xgboost_metrics_precision_and_recall

## # A tibble: 2 x 5
## # Groups:   model, scored_on [2]
##   model   .estimator scored_on  precision recall
##   <chr>   <chr>      <chr>          <dbl>  <dbl>
## 1 xgboost binary     train data      0.93   0.8 
## 2 xgboost binary     test data       0.71   0.46

Fit model metrics - threshold

rf_test_scored %>%
  roc_curve(fraudulent, .pred_1) %>%
  mutate(fpr = 1-round(specificity, 2),
         tpr = round(sensitivity, 3),
         threshold = 1 - round(.threshold, 3)) %>%
  select(fpr, tpr, threshold) %>%
  group_by(fpr) %>%
  summarize(tpr = max(tpr),
          threshold = max(threshold)) %>%
  ungroup() %>%
  mutate(precision = round(tpr/(fpr + tpr), 2)) %>%
  select(fpr, tpr, precision, threshold) %>%
  arrange(fpr) %>%
  filter(fpr <= 0.1)

## # A tibble: 11 x 4
##       fpr   tpr precision threshold
##     <dbl> <dbl>     <dbl>     <dbl>
##  1 0      0.536      1        0.612
##  2 0.0100 0.63       0.98     0.731
##  3 0.0200 0.68       0.97     0.789
##  4 0.0300 0.718      0.96     0.827
##  5 0.0400 0.74       0.95     0.847
##  6 0.0500 0.773      0.94     0.866
##  7 0.0600 0.796      0.93     0.883
##  8 0.0700 0.84       0.92     0.898
##  9 0.0800 0.862      0.92     0.908
## 10 0.09   0.878      0.91     0.916
## 11 0.1    0.884      0.9      0.92

Model Comparison

ROC_AUC curve

rf_scored %>%
  bind_rows(xgboost_scored) %>%
  filter(scored_on == "test data") %>%
  group_by(model)%>%
  roc_curve(fraudulent, .pred_1) %>%
  autoplot() +
  geom_vline(xintercept = 0.1, color = "blue", size = 0.5)

Metrics

precision_and_recall <- rf_metrics_precision_and_recall %>%
  bind_rows(xgboost_metrics_precision_and_recall)

rf_scored %>%
  bind_rows(xgboost_scored) %>%
  group_by(model, scored_on) %>%
  metrics(fraudulent, .pred_1, estimate = .pred_class ) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  select(model, .estimator, scored_on, accuracy, kap, mn_log_loss, roc_auc) %>%
  mutate_if(is.numeric, round, 3) %>%
  left_join(precision_and_recall) %>%
  select(model, scored_on, accuracy, kap, mn_log_loss, roc_auc, precision, recall)

## # A tibble: 4 x 8
##   model         scored_on  accuracy   kap mn_log_loss roc_auc precision recall
##   <chr>         <chr>         <dbl> <dbl>       <dbl>   <dbl>     <dbl>  <dbl>
## 1 random forest test data     0.973 0.607       0.101   0.96       0.95   0.46
## 2 random forest train data    0.99  0.883       0.037   1          1      0.8 
## 3 xgboost       test data     0.965 0.547       0.39    0.896      0.71   0.46
## 4 xgboost       train data    0.983 0.794       0.38    0.937      0.93   0.8