Big Data Analytics with Spark
October 04, 2015
[Figure: growth of data across three eras, 1980s, 1990-2000s, 2010 and beyond. Source: Cisco, IDC, Wikibon report 2013]
Quick Review
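The classic word count example in Hadoop MapReduce (Java):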
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  // Mapper: tokenizes each input line and emits (word, 1) for every token.
  public static class TokenizerMapper
      extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  // Reducer (also used as combiner): sums the counts emitted for each word.
  public static class IntSumReducer
      extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  // Driver: configures and submits the job; args[0] = input path, args[1] = output path.
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
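The MapReduce version is packaged into a jar and launched as a standard Hadoop job. The same word count expressed with Spark's RDD API (Scala) fits in a dozen lines: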
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object WordCount {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext()                               // configuration comes from spark-submit
    val lines = sc.textFile(args(0))                          // args(0): input path
    val wordCounts = lines.flatMap { line => line.split(" ") } // split each line into words
      .map(word => (word, 1))                                 // pair each word with a count of 1
      .reduceByKey(_ + _)                                     // sum the counts per word
    wordCounts.saveAsTextFile(args(1))                        // args(1): output path
  }
}
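A note on the Spark version: new SparkContext() with no arguments picks up its master URL and application name from the configuration supplied by spark-submit. For a quick run on a single machine outside spark-submit, the context could instead be built from an explicit SparkConf. The snippet below is a minimal sketch of that local setup, not part of the original slides.

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical local configuration for trying the word count on one machine.
val conf = new SparkConf()
  .setAppName("WordCount")
  .setMaster("local[*]")  // use as many worker threads as there are local cores
val sc = new SparkContext(conf)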
