-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathctr_pred_model.R
More file actions
192 lines (158 loc) · 6.38 KB
/
ctr_pred_model.R
File metadata and controls
192 lines (158 loc) · 6.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Set working directory (adjust to your own path)
# NOTE(review): hard-coded, machine-specific path — this script only runs on the
# original author's machine as-is. Consider relative paths or here::here() — TODO confirm.
setwd("/Users/sallypark/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Fall 2024/APAN5200 Framework and Methods I/Data")
# Step 1: Set up environment
library(caret)
library(dplyr)
library(e1071) # For SVM
library(randomForest)
library(rpart) # For decision-tree imputation (rpart() is called below)
library(tidyr)
library(xgboost)
# Load the analysis (training) and scoring (to-predict) data sets
analysis_data <- read.csv("analysis_data.csv")
scoring_data <- read.csv("scoring_data.csv")
# Step 2: Separate Target Variable and Prepare for Imputation
target <- analysis_data$CTR
analysis_data <- analysis_data |> select(-CTR) # Remove CTR for data processing

# Step 3: Impute Missing Values
# Helper: most frequent non-NA value of a vector.
# BUG FIX: get_mode() was called below but never defined anywhere in the file,
# so any character column containing NA made the pipeline error at run time.
get_mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NA)
  }
  names(which.max(table(x)))
}

# Apply imputation: column mean for numeric columns, column mode for
# character columns (applied independently to analysis and scoring data).
analysis_data <- analysis_data |>
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) |>
  mutate(across(where(is.character), ~ ifelse(is.na(.), get_mode(.), .)))
scoring_data <- scoring_data |>
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) |>
  mutate(across(where(is.character), ~ ifelse(is.na(.), get_mode(.), .)))
# Convert the known categorical columns to factors in both data sets
categorical_cols <- c("position_on_page", "ad_format", "age_group", "gender",
                      "location", "time_of_day", "day_of_week", "device_type")
for (col in categorical_cols) {
  analysis_data[[col]] <- as.factor(analysis_data[[col]])
  scoring_data[[col]] <- as.factor(scoring_data[[col]])
}
# 3b. Decision Tree Imputation for Categorical Columns
#
# For every factor column that still contains NAs, fit an rpart
# classification tree on the complete rows (all other columns as
# predictors) and fill the missing entries with the tree's class
# predictions.
#
# @param data A data.frame; factor columns may contain NAs.
# @return The same data.frame with factor-column NAs imputed where a
#   model could be fit; columns that are entirely NA are left unchanged.
impute_with_decision_tree <- function(data) {
  for (col in names(data)) {
    if (is.factor(data[[col]]) && anyNA(data[[col]])) {
      missing <- is.na(data[[col]])
      # Nothing to learn from if every value is missing — skip the column
      # instead of letting rpart() error on an empty training set.
      if (all(missing)) next
      # Train decision tree model to predict missing values for the column.
      # Namespace-qualified so this works even if library(rpart) was skipped.
      model <- rpart::rpart(as.formula(paste(col, "~ .")),
                            data = data, na.action = na.omit)
      # drop = FALSE keeps a data.frame even when exactly one row is missing
      data[[col]][missing] <- predict(model, data[missing, , drop = FALSE],
                                      type = "class")
    }
  }
  return(data)
}
# Apply decision tree imputation to categorical columns
analysis_data <- impute_with_decision_tree(analysis_data)
scoring_data <- impute_with_decision_tree(scoring_data)
# Sanity check: remaining NA counts per column (expect all zeros)
colSums(is.na(analysis_data))
# Step 4: Feature Engineering
# Derived features capturing ad exposure, engagement, and creative strength.
# One helper applied to both data sets keeps the two feature definitions
# guaranteed identical.
add_engineered_features <- function(df) {
  df |>
    mutate(
      audience_fatigue_index = ad_frequency * market_saturation,
      engagement_potential = ad_frequency / (1 + market_saturation^2),
      # Guard against division by zero for empty ad bodies
      body_char_per_word = ifelse(body_word_count == 0, 0,
                                  body_text_length / body_word_count),
      relevance_score = targeting_score + contextual_relevance,
      ad_strength = visual_appeal + cta_strength
    )
}
analysis_data <- add_engineered_features(analysis_data)
scoring_data <- add_engineered_features(scoring_data)
# Step 5: Encoding and Scaling
# Define categorical columns
categorical_cols <- c("position_on_page", "ad_format", "age_group", "gender",
                      "time_of_day", "day_of_week", "device_type", "location")
# Convert categorical columns to factors
analysis_data[categorical_cols] <- lapply(analysis_data[categorical_cols], as.factor)
scoring_data[categorical_cols] <- lapply(scoring_data[categorical_cols], as.factor)
# Ensure consistent factor levels between analysis_data and scoring_data.
# BUG FIX: the previous `levels(scoring_data[[col]]) <- levels(...)` RELABELS
# the existing levels positionally, silently corrupting the scoring values
# whenever the two data sets do not share an identical level set. Rebuilding
# the factor with factor(levels = ...) maps values by name instead (values
# unseen in the training data become NA, which is the correct signal).
for (col in categorical_cols) {
  scoring_data[[col]] <- factor(scoring_data[[col]],
                                levels = levels(analysis_data[[col]]))
}
# Dummy encoding for categorical variables (fit on analysis, applied to both)
dummy_vars <- dummyVars("~ .", data = analysis_data)
analysis_data_encoded <- predict(dummy_vars, newdata = analysis_data) |> as.data.frame()
scoring_data_encoded <- predict(dummy_vars, newdata = scoring_data) |> as.data.frame()
# Step 6: Prepare Data for Modeling
# Re-attach the target so caret can model CTR against the encoded features
analysis_data_encoded$CTR <- target

# 80/20 train/test split, stratified on CTR by createDataPartition
set.seed(123)
train_idx <- createDataPartition(analysis_data_encoded$CTR, p = 0.8, list = FALSE)
train_data <- analysis_data_encoded[train_idx, ]
test_data <- analysis_data_encoded[-train_idx, ]

# Step 7: Define Model Training Control — 5-fold cross-validation
fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
# Step 8: Train Multiple Models
# Every model shares the same formula, resampling scheme, and metric, so a
# small local helper keeps each fit to one line. The seed is reset before
# every fit, exactly as the original per-model calls did.
fit_ctr_model <- function(method) {
  set.seed(123)
  train(CTR ~ ., data = train_data,
        method = method,
        trControl = fitControl,
        metric = "RMSE")
}
# 8a. XGBoost
xgb_model <- fit_ctr_model("xgbTree")
# 8b. Random Forest
rf_model <- fit_ctr_model("rf")
# 8c. Decision Tree (disabled)
#dt_model <- fit_ctr_model("rpart")
# 8d. Linear Regression
lm_model <- fit_ctr_model("lm")
# 8e. Support Vector Machine (SVM)
svm_model <- fit_ctr_model("svmRadial")
# Step 9: Model Evaluation on Test Data
# Root-mean-squared error of a fitted model's predictions.
#
# @param model A fitted model with a predict() method.
# @param data Data frame of predictors passed to predict().
# @param target Numeric vector of observed values.
# @return RMSE as a single numeric value.
rmse_eval <- function(model, data, target) {
  errors <- predict(model, newdata = data) - target
  sqrt(mean(errors^2))
}
# Calculate RMSE for each model on the held-out test split
xgb_rmse <- rmse_eval(xgb_model, test_data, test_data$CTR)
rf_rmse  <- rmse_eval(rf_model,  test_data, test_data$CTR)
#dt_rmse <- rmse_eval(dt_model, test_data, test_data$CTR)
lm_rmse  <- rmse_eval(lm_model,  test_data, test_data$CTR)
svm_rmse <- rmse_eval(svm_model, test_data, test_data$CTR)

# Combine results into a data frame for side-by-side comparison
rmse_results <- data.frame(
  Model = c("XGBoost", "Random Forest", "Linear Regression", "SVM"),
  RMSE  = c(xgb_rmse, rf_rmse, lm_rmse, svm_rmse)
)
print(rmse_results)

# Identify the best model based on RMSE (lowest wins)
best_model_name <- rmse_results$Model[which.min(rmse_results$RMSE)]
cat("Best Model:", best_model_name, "\n")
# Step 10: Generate Predictions for Scoring Data Using Best Model
# Map the winning model name back to its fitted object via a named list
# (best_model_name is always one of these four labels)
model_lookup <- list(
  "XGBoost"           = xgb_model,
  "Random Forest"     = rf_model,
  "Linear Regression" = lm_model,
  "SVM"               = svm_model
)
best_model <- model_lookup[[best_model_name]]
best_model

# Generate predictions on the encoded scoring data
predictions <- predict(best_model, newdata = scoring_data_encoded)

# Prepare submission data frame
submission <- data.frame(ID = scoring_data$id, CTR = predictions)

# Write submission to CSV
write.csv(submission, "CTR_predictions7.csv", row.names = FALSE)