Distributed grep using Hadoop
The Hadoop word count example is commonly used to introduce MapReduce concepts. I have altered the word count sample to do pattern matching, so that it works like the UNIX grep command.
First, copy the text file to an HDFS location, then run the job:
bin/hadoop dfs -copyFromLocal <local-dir> <hdfs-dir>
bin/hadoop jar <path>/grep.jar org.myorg.Grep <hdfs-input-dir> <hdfs-output-dir> <pattern>
package org.myorg;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Grep {
public static class Map extends MapReduceBase
implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private Pattern pattern;
private int group;
public void configure(JobConf job) {
pattern = Pattern.compile(job.get("mapred.mapper.regex"));
group = job.getInt("mapred.mapper.regex.group", 0);
}
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {
String line = value.toString();
Matcher matcher = pattern.matcher(line);
if (matcher.find()) {
output.collect(new Text(line), one);
}
}
}
public static void main(String[] args) throws Exception {
if (args.length < 3) {
System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
return;
}
JobConf conf = new JobConf(Grep.class);
conf.setJobName("Grep");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.set("mapred.mapper.regex", args[2]);
if (args.length == 4) {
conf.set("mapred.mapper.regex.group", args[3]);
}
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Advertisement


