Uplift Modelのパッケージを作ってみた

費用対効果の最大化するためのデータマイニング手法であるUplift ModelをRで実装してパッケージ化してみました。

ドキュメント書くのが面倒で、まだcranにはアップしてませんが、使い方を簡単に紹介したいと思います。
主に使う関数は、以下の2つ

buildUpliftTree(y.train, treat.train, x.train)
- 学習データから、費用対効果を最大化するような分類を行うモデルを構築する
classify(uplift.tree, x.new)
- 構築されたUpliftTreeを元に、新しいデータを分類する

ライブラリ読み込み

library(EasyUpliftTree)

データの用意

手元にいい感じのデータがないので、今回はsurvivalパッケージのcolonデータをちょっと加工して使用する。
実際は、医療用のデータだけど、以下のように考えて、y、treat、xを作る。

status	購買有無。0: 購入あり、1: 購入なし
rx	DM有無。Lev+5FU: DMあり、Obs: DMなし
その他	何か属性など様々な変数。

library(survival)
data(colon)
sample.data <- na.omit(colon[colon$rx != "Lev" & colon$etype == 2, ])
treat <- ifelse(sample.data$rx == "Lev+5FU", 1, 0)
y <- ifelse(sample.data$status == 0, 1, 0)
x <- sample.data[, c(4:9, 11:14)]

x$v1 <- factor(x$sex)
x$v2 <- factor(x$obstruct)
x$v3 <- factor(x$perfor)
x$v4 <- factor(x$adhere)
x$v5 <- factor(x$differ)
x$v6 <- factor(x$extent)
x$v7 <- factor(x$surg)
x$v8 <- factor(x$node4)

学習用のデータとテスト用データの作成

index <- 1:nrow(x)
train.index <- index[(index%%2 == 0)]
test.index <- index[index%%2 != 0]

y.train <- y[train.index]
x.train <- x[train.index, ]
treat.train <- treat[train.index]

y.test <- y[test.index]
x.test <- x[test.index, ]
treat.test <- treat[test.index]

Uplift Treeの構築

結果ベクトル、介入ベクトル、説明変数を、buildUpliftTree関数に突っ込むだけ

uplift.tree <- buildUpliftTree(y.train, treat.train, x.train)
print(uplift.tree)

## sex : 1 ?
##  R->nodes : 3 ?
##    R->perfor : 1 ?
##      R->age : 55.4 ?
##        R->1 
##        L->1 
##      L->nodes : 6 ?
##        R->age : 48 ?
##          R->v5 : 2 ?
##            R->1 
##            L->0.75 
##          L->nodes : 12.8 ?
##            R->1 
##            L->0 
##        L->v8 : 0 ?
##          R->v4 : 0 ?
##            R->0.777777777777778 
##            L->0 
##          L->age : 58.8 ?
##            R->0 
##            L->0 
##    L->v8 : 0 ?
##      R->extent : 3 ?
##        R->age : 55 ?
##          R->v4 : 0 ?
##            R->differ : 1.1 ?
##              R->0.777777777777778 
##              L->0.75 
##            L->0.5 
##          L->v7 : 0 ?
##            R->adhere : 0.199999999999999 ?
##              R->1 
##              L->1 
##            L->0 
##        L->0.727272727272727 
##      L->0 
##  L->age : 63 ?
##    R->v2 : 0 ?
##      R->nodes : 3.3 ?
##        R->age : 67.1 ?
##          R->age : 72.8 ?
##            R->1 
##            L->v8 : 0 ?
##              R->1 
##              L->0 
##          L->0.6 
##        L->v5 : 1 ?
##          R->extent : 2.5 ?
##            R->1 
##            L->1 
##          L->v5 : 2 ?
##            R->v3 : 0 ?
##              R->0.733333333333333 
##              L->0.5 
##            L->age : 72 ?
##              R->1 
##              L->0 
##      L->0.333333333333333 
##    L->surg : 1 ?
##      R->nodes : 2 ?
##        R->0.454545454545455 
##        L->0.6 
##      L->obstruct : 1 ?
##        R->1 
##        L->v5 : 1 ?
##          R->1 
##          L->adhere : 1 ?
##            R->1 
##            L->age : 48.8 ?
##              R->0.166666666666667 
##              L->0.142857142857143

購入率の確認

toDateFrame関数でデータフレームに変換して平均を取る

uplift.df <- toDataFrame(uplift.tree)
mean(uplift.df[uplift.df$node.type == "R" & uplift.df$treat == 1, "y"])
## [1] 0.7215
mean(uplift.df[uplift.df$node.type == "L" & uplift.df$treat ==  1, "y"])
## [1] 0.4667

RグループのDMを受け取ったユーザーの購入率は、Lグループよりもかなり高くなっている

テストデータをRグループとLグループに分類

classify関数に、上で構築したuplift.treeと、新しい説明変数を突っ込む。

x.test$node.type <- sapply(1:nrow(x.test), function(i) classify(uplift.tree, x.test[i, ]))

ROI比較

以下のような条件でROIを計算してみる

1人当たりのコスト : 100円
1個当たりの売上 : 200円

# 全体にDMを送った場合
roi.data.all <- data.frame(type = "All", 
                                        cost = length(y.test[treat.test == 1]) * 100, 
                                        amount = sum(y.test) * 200)

# RのグループにDM送った場合
## Rグループのtreat=0は介入があっても結果が変わらない、Lグループのtreat==1は、介入しなかった場合は、購入無しと厳しめな仮定をおいてる
roi.data.r <- data.frame(type = "R", 
                                      cost = length(y.test[x.test$node.type == "R"]) * 100, 
                                      amount = sum(y.test[!(x.test$node.type ==  "L" & treat.test == 1)]) * 200)

# ROIを算出
roi.data <- rbind(roi.data.all, roi.data.r)
roi.data$ROI <- roi.data$amount/roi.data$cost
roi.data

##   type  cost amount   ROI
## 1  All 15000  30200 2.013
## 2    R  5400  15800 2.926

Rグループに対してだけDMを送るとROIは2.9倍で、単純に全体に送った場合よりも費用対効果が高くなっている

今後

「EasyUpliftTree」という名前の通り、まだまだ簡易版って感じなので、これをたたき台としてiAnalysisの@isseing333さんと開発を進めていこうと思います！