[pig][メモ]DataFuにあるPageRankを試してみた
Pig UDFのライブラリDataFuに、PageRankがあるらしい事を知ったので試してみた。
・http://twitter.com/shiumachi/status/253478760119156736
ライブラリのダウンロードと展開
# Fetch the DataFu 0.0.4 release tarball (certificate verification is skipped).
$ wget --no-check-certificate https://github.com/downloads/linkedin/datafu/datafu-0.0.4.tar.gz
# Unpack the archive.
$ tar -xzvf datafu-0.0.4.tar.gz
# List the build artifacts; datafu-0.0.4.jar is the one registered from Pig.
$ ls datafu-0.0.4/dist
datafu-0.0.4-javadoc.jar  datafu-0.0.4.jar     datafu-docs.war
datafu-0.0.4-sources.jar  datafu-coverage.war
データの用意
「[R]ネットワーク分析 - ネットワークの比較 - yokkunsの日記」で使った、ハイテク企業の管理職21人の社会ネットワークデータのうち、友人関係のネットワークを使う。
library(sna)

## Adjacency matrix of the friendship network (21 x 21).
## Each line below is one matrix row (byrow = TRUE).
FRIEND <- matrix(
  c(
    0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,
    1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,
    1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,
    0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,
    0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,
    1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,0,1,1,1,0,0,
    1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,
    0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
    1,0,1,0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
    1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,1,
    0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,1,0,
    0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,
    0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
  ),
  nrow = 21,
  byrow = TRUE
)

## Convert the adjacency matrix to an edge list.
edge_list <- as.edgelist.sna(FRIEND)

## Write the edge list as a tab-separated file without row or column names.
write.table(
  edge_list,
  file = "friend.edgelist.tsv",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE
)
HDFSにput
# Upload the edge list to HDFS. `hadoop fs` is used instead of the deprecated
# `hadoop dfs` entry point; the arguments and destination path are unchanged.
$ hadoop fs -put friend.edgelist.tsv ./tmp/friend_edgelist
pig実行
pigコード
-- Run DataFu's PageRank UDF over the friendship edge list.
-- Parameters (the values below are the defaults):
--   friend_edges_dir : input edge list, loaded as (source, dest, weight)
--   output_dir       : location where the (source, rank) rows are stored
REGISTER 'datafu-0.0.4.jar';

%default friend_edges_dir 'tmp/friend_edgelist'
%default output_dir './tmp/friend_pagerank'

DEFINE PageRank datafu.pig.linkanalysis.PageRank('dangling_nodes','true');

-- One row per edge.
edge_rows = LOAD '$friend_edges_dir' AS (source:int, dest:int, weight:double);

-- Collect each source node's outgoing (dest, weight) pairs into one bag.
edges_by_source = GROUP edge_rows BY source;
out_edges = FOREACH edges_by_source GENERATE
    group AS source,
    edge_rows.(dest, weight) AS edges;

-- Put the whole graph into a single group so the UDF is called once
-- with every (source, edges) tuple.
whole_graph = GROUP out_edges ALL;
ranks = FOREACH whole_graph GENERATE
    FLATTEN(PageRank(out_edges.(source, edges))) AS (source, rank);

-- Sort by node id; PARALLEL 1 requests a single reducer for the sort.
ranks_sorted = ORDER ranks BY source PARALLEL 1;

STORE ranks_sorted INTO '$output_dir';
結果
1	0.10949851
2	0.15463765
3	0.021636453
4	0.08672039
5	0.026471648
6	0.015206676
7	0.027113214
8	0.046544723
9	0.02419316
10	0.012026792
11	0.032995425
12	0.06756917
13	0.011376936
14	0.03042857
15	0.029928315
16	0.04438733
17	0.05944765
18	0.07817628
19	0.029773977
20	0.016299177
21	0.07556794