risenfall

Use Free Software.

Distributed grep using Hadoop

leave a comment »

The Hadoop word count example is commonly used to introduce MapReduce concepts. I have altered
the word count sample to do pattern matching, so that it works like the UNIX grep command.

First, copy the text file to an HDFS location, then run the job:

bin/hadoop dfs -copyFromLocal <local-dir> <hdfs-dir>
bin/hadoop jar <path>/grep.jar org.myorg.Grep <hdfs-input-dir> <hdfs-output-dir> <pattern>

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Grep {

    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Pattern pattern;
        private int group;

        public void configure(JobConf job) {
            pattern = Pattern.compile(job.get("mapred.mapper.regex"));
            group = job.getInt("mapred.mapper.regex.group", 0);
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                        Reporter reporter) throws IOException {
            String line = value.toString();
            Matcher matcher = pattern.matcher(line);
            if (matcher.find()) {
                output.collect(new Text(line), one);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
            return;
        }
        JobConf conf = new JobConf(Grep.class);
        conf.setJobName("Grep");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(Map.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.set("mapred.mapper.regex", args[2]);
        if (args.length == 4) {
            conf.set("mapred.mapper.regex.group", args[3]);
        }
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}

Advertisement

Written by risenfall

January 8, 2012 at 5:18 pm

Posted in hadoop, HDFS

Tagged with

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.