[pig][メモ]DataFuにあるPageRankを試してみた

Pig UDFのライブラリDataFuに、PageRankがあるらしい事を知ったので試してみた。
http://twitter.com/shiumachi/status/253478760119156736

ライブラリのダウンロードと展開

$ wget --no-check-certificate https://github.com/downloads/linkedin/datafu/datafu-0.0.4.tar.gz
$ tar zxvf datafu-0.0.4.tar.gz
$ ls datafu-0.0.4/dist

datafu-0.0.4-javadoc.jar  datafu-0.0.4.jar     datafu-docs.war
datafu-0.0.4-sources.jar  datafu-coverage.war

データの用意

「[R]ネットワーク分析 - ネットワークの比較 - yokkunsの日記」で使った、ハイテク企業の管理職21人の社会ネットワークデータのうち、友人関係のネットワークを使う。

library(sna)

# Friendship network as a 21 x 21 adjacency matrix.
# The values are filled row by row (byrow = TRUE), so each line of numbers
# below is one row of the matrix; a 1 in row i, column j is an edge i -> j.
FRIEND <- matrix(
  c(
    0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,
    1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,
    1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,
    0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,
    0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,
    1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,0,1,1,1,0,0,
    1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,
    0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
    1,0,1,0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
    1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,1,
    0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,1,0,
    0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,
    0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
  ),
  nrow = 21,
  byrow = TRUE
)

# Convert the adjacency matrix into sna's edge-list form
# (one row per edge; the Pig script later reads the columns as
# source, dest, weight).
friend.edge.list <- as.edgelist.sna(FRIEND)

# Dump the edge list as a bare TSV: tab-separated, with no header line and
# no row names, so the file contains nothing but the edge rows.
write.table(
  friend.edge.list,
  file = "friend.edgelist.tsv",
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)
HDFSにput
$ hadoop dfs -put friend.edgelist.tsv ./tmp/friend_edgelist

pig実行

pigコード
-- Compute PageRank over the friend edge list with the DataFu PageRank UDF
-- and store one (source, rank) row per node, sorted by node id.
REGISTER 'datafu-0.0.4.jar';

-- Input and output locations; both can be overridden with -param.
%default friend_edges_dir 'tmp/friend_edgelist'
%default output_dir './tmp/friend_pagerank'

-- Instantiate the UDF with its 'dangling_nodes' option switched on
-- (presumably so that nodes without outgoing edges are handled; see the
-- DataFu PageRank documentation).
DEFINE PageRank datafu.pig.linkanalysis.PageRank('dangling_nodes','true');

-- One row per edge: source node, destination node, edge weight.
friend_edges = LOAD '$friend_edges_dir' AS (source:int, dest:int, weight:double);

-- Collect the outgoing (dest, weight) pairs of every source node into a bag.
edges_by_source = GROUP friend_edges BY source;
friend_edges_grouped = FOREACH edges_by_source GENERATE
    group AS source,
    friend_edges.(dest,weight) AS edges;

-- Put the whole graph into a single group so that the UDF receives every
-- (source, edges) tuple in one call, then flatten its result into rows.
whole_graph = GROUP friend_edges_grouped ALL;
friend_ranks = FOREACH whole_graph GENERATE
    FLATTEN(PageRank(friend_edges_grouped.(source,edges))) AS (source,rank);

-- A single reducer yields one output file that is globally ordered by source.
friend_ranks = ORDER friend_ranks BY source PARALLEL 1;
STORE friend_ranks INTO '$output_dir';
結果(左からノードID、PageRank値)
1	0.10949851
2	0.15463765
3	0.021636453
4	0.08672039
5	0.026471648
6	0.015206676
7	0.027113214
8	0.046544723
9	0.02419316
10	0.012026792
11	0.032995425
12	0.06756917
13	0.011376936
14	0.03042857
15	0.029928315
16	0.04438733
17	0.05944765
18	0.07817628
19	0.029773977
20	0.016299177
21	0.07556794