#############################################
## 11/06/2012
#############################################

## rhipe setup: initialize RHIPE and point the runner at the
## R 2.15.1 installation on the cluster nodes
library(Rhipe)
rhinit()
rhoptions(runner=paste("/opt/R-2.15.1/bin/", rhoptions()$runner, sep=""))

#############################################
## how hadoop distributes "lapply" input
## number of map tasks or number of input splits = 3
#############################################

## map-reduce: each call of the map expression emits the entire
## map.keys/map.values buffer it was handed
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.distribute",
  inout = c('lapply','sequence'),
  N = 10,   # "lapply" input: keys and values are the indices 1..N
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)

## read in output
rst = rhread("/ln/li271/tmp/lapply.distribute")

## look at what's in map.keys/map.values; note that for "lapply" input
## they are the same: each key equals its value (the index)
class(rst)
length(rst)
rst[[1]]
class(rst[[1]])
length(rst[[1]])
rst[[1]][[1]]

## how many keys/values in each map.keys/map.values
sapply(rst, function(r) length(r[[1]]))

#############################################
## scale up "lapply" input
## buffer size will play a role here
## number of map tasks or number of input splits = 3
#############################################

## map-reduce: same job as above, but with N = 10000 the map buffer fills
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.buff3000",
  inout = c('lapply','sequence'),
  N = 10000,
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)

## read in output
rst = rhread("/ln/li271/tmp/lapply.buff3000")

## how many keys/values in each map.keys/map.values;
## the default buffer size is 3000, so each call of the map expression
## sees at most 3000 key/value pairs
sapply(rst, function(r) length(r[[1]]))

#############################################
## look at buffer size with input data
#############################################

## look at the input data
rhls("/ln/li271/tmp/lec5")
## two datasets of size 1GB each
rhls("/ln/li271/tmp/lec5/size1GB.each16MB")
rhls("/ln/li271/tmp/lec5/size1GB.each32KB")

## dataset1 has 64 subsets of size 16MB each (64 x 16MB = 1GB)
a = rhread("/ln/li271/tmp/lec5/size1GB.each16MB", max=2)
length(a[[1]])
head(a[[1]][[2]])
log2(length(a[[1]][[2]]))
print(object.size(a[[1]][[2]]), units="Mb")

## map-reduce to see the number of keys/values in each
## map.keys/map.values for dataset1;
## note mapred.reduce.tasks=0, so each mapper produces 1 output file
map = expression({
  rhcollect(unlist(map.keys), length(map.values))
})
mr = rhmr(
  map = map,
  ifolder = "/ln/li271/tmp/lec5/size1GB.each16MB",
  ofolder = "/ln/li271/tmp/lec5/size1GB.each16MB.npairs",
  inout = c('sequence','sequence'),
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)
a = rhread("/ln/li271/tmp/lec5/size1GB.each16MB.npairs")
## number of key/value pairs seen by each call of the map expression
sapply(a, '[[', 2)
a

## dataset2 has 32768 subsets of size 32KB each (32768 x 32KB = 1GB)
a = rhread("/ln/li271/tmp/lec5/size1GB.each32KB", max=2)
length(a[[1]])
head(a[[1]][[2]])
log2(length(a[[1]][[2]]))
print(object.size(a[[1]][[2]]), units="Kb")

## map-reduce to see the number of keys/values in each
## map.keys/map.values for dataset2;
## note mapred.reduce.tasks=0, so each mapper produces 1 output file
map = expression({
  rhcollect(unlist(map.keys), length(map.values))
})
mr = rhmr(
  map = map,
  ifolder = "/ln/li271/tmp/lec5/size1GB.each32KB",
  ofolder = "/ln/li271/tmp/lec5/size1GB.each32KB.npairs",
  inout = c('sequence','sequence'),
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0
  )
)
ex = rhex(mr, async=FALSE)
a = rhread("/ln/li271/tmp/lec5/size1GB.each32KB.npairs")
## number of key/value pairs seen by each call of the map expression
sapply(a, '[[', 2)
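
#############################################
## sanity check on the subset counts (added sketch)
#############################################

## A quick plain-R check, not part of the original lecture run: splitting
## 1GB into 16MB pieces gives 64 subsets, and splitting it into 32KB
## pieces gives 32768 subsets, matching the two datasets above.
c(
  n.16MB.subsets = 1024 / 16,          # 1GB = 1024MB   -> 64 subsets
  n.32KB.subsets = 1024 * 1024 / 32    # 1GB = 1048576KB -> 32768 subsets
)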
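
#############################################
## sketch: changing the map buffer size
#############################################

## A hedged sketch, not from the original lecture run: RHIPE exposes the
## map-side buffer through the rhipe_map_buff_size entry of the mapred
## list (assumed to be available in this RHIPE version). Raising or
## lowering it changes how many key/value pairs each call of the map
## expression sees in map.keys/map.values. The job below repeats the
## N = 10000 "lapply" run with a 5000-pair buffer, so the counts from
## sapply() should come out in chunks of up to 5000 instead of 3000.
map = expression({
  rhcollect(map.keys, map.values)
})
mr = rhmr(
  map = map,
  ofolder = "/ln/li271/tmp/lapply.buff5000",   # hypothetical output folder
  inout = c('lapply','sequence'),
  N = 10000,
  mapred = list(
    mapred.map.tasks=3,
    mapred.reduce.tasks=0,
    rhipe_map_buff_size=5000                   # assumed RHIPE option name
  )
)
ex = rhex(mr, async=FALSE)

## count how many key/value pairs each map.keys buffer held
rst = rhread("/ln/li271/tmp/lapply.buff5000")
sapply(rst, function(r) length(r[[1]]))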