#############################################
## 11/06/2012
#############################################

## rhipe setup: initialize RHIPE and point the runner at the
## R 2.15.1 installation on the cluster nodes
library(Rhipe)
rhinit()
rhoptions(runner=paste("/opt/R-2.15.1/bin/", rhoptions()$runner, sep=""))

#############################################
## how hadoop distributes "lapply" input
## number of map tasks or number of input splits = 3
#############################################

## map-reduce: each call of the map expression emits the entire
## map.keys/map.values buffer it was handed
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.distribute",
  inout = c('lapply','sequence'),
  N = 10,   # "lapply" input: keys and values are the indices 1..N
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)

## read in output
rst = rhread("/ln/li271/tmp/lapply.distribute")

## look at what's in map.keys/map.values; note that for "lapply" input
## they are the same: each key equals its value (the index)
class(rst)
length(rst)
rst[[1]]
class(rst[[1]])
length(rst[[1]])
rst[[1]][[1]]

## how many keys/values in each map.keys/map.values
sapply(rst, function(r) length(r[[1]]))

#############################################
## scale up "lapply" input
## buffer size will play a role here
## number of map tasks or number of input splits = 3
#############################################

## map-reduce: same job as above, but with N = 10000 the map buffer fills
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.buff3000",
  inout = c('lapply','sequence'),
  N = 10000,
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)

## read in output
rst = rhread("/ln/li271/tmp/lapply.buff3000")

## how many keys/values in each map.keys/map.values;
## the default buffer size is 3000, so each call of the map expression
## sees at most 3000 key/value pairs
sapply(rst, function(r) length(r[[1]]))

#############################################
## look at buffer size with input data
#############################################

## look at the input data
rhls("/ln/li271/tmp/lec5")
## two datasets of size 1GB each
rhls("/ln/li271/tmp/lec5/size1GB.each16MB")
rhls("/ln/li271/tmp/lec5/size1GB.each32KB")

## dataset1 has 64 subsets of size 16MB each (64 x 16MB = 1GB)
a = rhread("/ln/li271/tmp/lec5/size1GB.each16MB", max=2)
length(a[[1]])
head(a[[1]][[2]])
log2(length(a[[1]][[2]]))
print(object.size(a[[1]][[2]]), units="Mb")

## map-reduce to see the number of keys/values in each
## map.keys/map.values for dataset1;
## note mapred.reduce.tasks=0, so each mapper produces 1 output file
map = expression({
  rhcollect(unlist(map.keys), length(map.values))
})
mr = rhmr(
  map = map,
  ifolder = "/ln/li271/tmp/lec5/size1GB.each16MB",
  ofolder = "/ln/li271/tmp/lec5/size1GB.each16MB.npairs",
  inout = c('sequence','sequence'),
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)
a = rhread("/ln/li271/tmp/lec5/size1GB.each16MB.npairs")
## number of key/value pairs seen by each call of the map expression
sapply(a, '[[', 2)
a

## dataset2 has 32768 subsets of size 32KB each (32768 x 32KB = 1GB)
a = rhread("/ln/li271/tmp/lec5/size1GB.each32KB", max=2)
length(a[[1]])
head(a[[1]][[2]])
log2(length(a[[1]][[2]]))
print(object.size(a[[1]][[2]]), units="Kb")

## map-reduce to see the number of keys/values in each
## map.keys/map.values for dataset2;
## note mapred.reduce.tasks=0, so each mapper produces 1 output file
map = expression({
  rhcollect(unlist(map.keys), length(map.values))
})
mr = rhmr(
  map = map,
  ifolder = "/ln/li271/tmp/lec5/size1GB.each32KB",
  ofolder = "/ln/li271/tmp/lec5/size1GB.each32KB.npairs",
  inout = c('sequence','sequence'),
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)
a = rhread("/ln/li271/tmp/lec5/size1GB.each32KB.npairs")
## number of key/value pairs seen by each call of the map expression
sapply(a, '[[', 2)
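
#############################################
## sanity check on the subset counts (added sketch)
#############################################

## A quick plain-R check, not part of the original lecture run: splitting
## 1GB into 16MB pieces gives 64 subsets, and splitting it into 32KB
## pieces gives 32768 subsets, matching the two datasets above.
c(
  n.16MB.subsets = 1024 / 16,          # 1GB = 1024MB   -> 64 subsets
  n.32KB.subsets = 1024 * 1024 / 32    # 1GB = 1048576KB -> 32768 subsets
)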
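
#############################################
## sketch: changing the map buffer size
#############################################

## A hedged sketch, not from the original lecture run: RHIPE exposes the
## map-side buffer through the rhipe_map_buff_size entry of the mapred
## list (assumed to be available in this RHIPE version). Raising or
## lowering it changes how many key/value pairs each call of the map
## expression sees in map.keys/map.values. The job below repeats the
## N = 10000 "lapply" run with a 5000-pair buffer, so the counts from
## sapply() should come out in chunks of up to 5000 instead of 3000.
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.buff5000",   # hypothetical output folder
  inout = c('lapply','sequence'),
  N = 10000,
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0,
    rhipe_map_buff_size=5000                   # assumed RHIPE option name
  )
)
ex = rhex(mr, async=FALSE)

## count how many key/value pairs each map.keys buffer held
rst = rhread("/ln/li271/tmp/lapply.buff5000")
sapply(rst, function(r) length(r[[1]]))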