R + Hadoop = Big Data Analytics
How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
library(rmr)

# mapreduce(…) is to data on HDFS what lapply() is to an in-memory list:
lapply(data, function)
mapreduce(big.data, map = function)
Expose MR                     Hide MR
                              Hive, Pig
rmr, Rhipe, Dumbo,            Cascalog, Scalding,
Pydoop, Hadoopy               Scrunch
Java, C++                     Cascading, Crunch
# the one entry point, plus two data-movement helpers
mapreduce(input, output, map, reduce)

x = from.dfs(hdfs.object)     # read an HDFS object into memory
hdfs.object = to.dfs(x)       # write an in-memory object to HDFS
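
A minimal round-trip sketch (the object is illustrative, and assumes rmr's default serialization handles it; to.data.frame is the same option the k-means example below relies on):

backup = to.dfs(mtcars)                          # ship a data frame to HDFS
restored = from.dfs(backup, to.data.frame = T)   # read it back into the session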
# in-memory version
small.ints = 1:1000
lapply(small.ints, function(x) x^2)

# the same computation on HDFS
small.ints = to.dfs(1:1000)
mapreduce(input = small.ints,
          map = function(k, v) keyval(v, v^2))

# in-memory: count how often each value occurs
groups = rbinom(32, n = 50, prob = 0.4)   # 50 draws from Binomial(32, 0.4)
tapply(groups, groups, length)

# the same count as a full map + reduce job
groups = to.dfs(groups)
mapreduce(input = groups,
          map = function(k, v) keyval(v, 1),
          reduce = function(k, vv) keyval(k, length(vv)))
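
The job returns a handle to its output, so the counts can be pulled back for inspection; a sketch:

counts = mapreduce(input = groups,
                   map = function(k, v) keyval(v, 1),
                   reduce = function(k, vv) keyval(k, length(vv)))
from.dfs(counts, to.data.frame = T)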
# a filtering job: emit only records that satisfy a predicate
condition = function(x) x > 10

out = mapreduce(
     input = input,
     map = function(k, v)
          if (condition(v)) keyval(k, v))
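
Run end to end with made-up data (the input values here are hypothetical):

input = to.dfs(sample(1:100, 20))
out = mapreduce(input = input,
                map = function(k, v) if (condition(v)) keyval(k, v))
from.dfs(out)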
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a - b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for (i in 1:iterations) {
      newCenters = kmeans.iter(points, distfun, centers = newCenters)
    }
    newCenters
  }

kmeans.iter =
  function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
    from.dfs(
      mapreduce(
        input = points,
        map = if (is.null(centers)) {
          # first pass: assign each point to a random center
          function(k, v) keyval(sample(1:ncenters, 1), v)
        } else {
          # later passes: assign each point to its nearest center
          function(k, v) {
            distances = apply(centers, 1, function(c) distfun(c, v))
            keyval(centers[which.min(distances), ], v)
          }
        },
        reduce = function(k, vv)
          keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
      to.data.frame = T)
  }
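
A hedged driver for the code above (made-up data; assumes each record read from points is one point, as in the deck's other vector examples):

points = to.dfs(matrix(rnorm(200), ncol = 2))   # 100 random 2-d points
centers = kmeans(points, ncenters = 5)

For comparison, here is a k-means loop written as a Pig script driven from embedded Python (Jython), clustering student GPAs: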
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids: divide the score range evenly
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
            DEFINE find_centroid FindCentroid('$centroids');
            raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
            centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
            grouped = group centroided by centroid;
            result = foreach grouped generate group, AVG(centroided.gpa);
            store result into 'output';
          """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration and calculate
    # how far they moved relative to the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
  double[] centroids;
  public FindCentroid(String initialCentroid) {
     String[] centroidStrings = initialCentroid.split(":");
     centroids = new double[centroidStrings.length];
     for (int i=0;i<centroidStrings.length;i++)
        centroids[i] = Double.parseDouble(centroidStrings[i]);
  }
  @Override
  public Double exec(Tuple input) throws IOException {
     double min_distance = Double.MAX_VALUE;
     double closest_centroid = 0;
     for (double centroid : centroids) {
        double distance = Math.abs(centroid - (Double)input.get(0));
        if (distance < min_distance) {
            min_distance = distance;
            closest_centroid = centroid;
        }
     }
     return closest_centroid;
  }

}
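
Back in rmr, composition is ordinary R: a job's return value can be used wherever an input is expected.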
# chaining: feed one job's output straight into another
mapreduce(mapreduce(…

# union of inputs
mapreduce(input = c(input1, input2), …)

# the signature of rmr's relational join
equijoin = function(
  left.input, right.input, input,
  output,
  outer,
  map.left, map.right,
  reduce, reduce.all)
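
A sketch of chaining two jobs (names and data are illustrative):

squares = mapreduce(input = to.dfs(1:100),
                    map = function(k, v) keyval(v, v^2))
big.squares = mapreduce(input = squares,
                        map = function(k, v) if (v > 1000) keyval(k, v))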
# reuse: one job's output can feed several downstream jobs
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

# jobs wrap naturally into ordinary R functions
abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input,
              output = output)
  …
  result}
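
Filling in the template, a concrete job might look like this (hypothetical; it reuses the do.call(rbind, vv) idiom from the k-means reducer):

col.means = function(input, output = NULL)
  mapreduce(input = input,
            output = output,
            map = function(k, v) keyval(1, v),
            reduce = function(k, vv)
              keyval(k, apply(do.call(rbind, vv), 2, mean)))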
Other rmr features:
  input.format, output.format, format
  reduce.on.data.frame, to.data.frame
  local and hadoop backends
  profiling
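
The local backend makes development easy: the same code runs in-process on ordinary R objects before being pointed at a cluster. The option-setting call below is the rmr2-era spelling, shown here as an assumption for the rmr version in this deck:

rmr.options(backend = "local")    # assumption: test in-process, no cluster needed
rmr.options(backend = "hadoop")   # then run the same code on Hadoop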