Java Code Examples for parquet.hadoop.ParquetOutputFormat

The following are top-voted examples showing how to use parquet.hadoop.ParquetOutputFormat. These examples are extracted from open source projects. You can vote up the examples you like; your votes will be used in our system to produce more good examples.
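Before the individual examples, here is a minimal sketch of how ParquetOutputFormat is typically wired into a new-API MapReduce job. It assumes the Group/GroupWriteSupport example write path and a hypothetical two-field schema; the class name and codec choice are illustrative, and the setter calls mirror those used in Example 3 below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageTypeParser;

public class ParquetJobSetup {

  // Returns a Job configured to write Parquet via the GroupWriteSupport example path.
  public static Job configureParquetWrite(Configuration conf, Path outputPath) throws Exception {
    // Hypothetical schema; replace with a message type matching your records.
    String schema = "message click_log { required binary ip (UTF8); required int64 ts; }";
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(schema), conf);

    Job job = new Job(conf, "write-parquet");
    job.setOutputFormatClass(ParquetOutputFormat.class);
    ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setOutputPath(job, outputPath);
    // Mapper and input configuration are omitted; see Example 3 for a complete write job.
    return job;
  }
}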

 
Example 1
Project: iow-hadoop-streaming   File: ParquetAsTextOutputFormat.java   (7 votes)

private static CompressionCodecName getCodec(JobConf conf) {
    CompressionCodecName codec;

    if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) { // from hadoop config
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + codecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(codecClass);
    } else {
        codec = CompressionCodecName.UNCOMPRESSED;
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
Example 2
Project: tajo   File: ParquetAppender.java   (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema,
                       TableMeta meta, Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getOption(ParquetOutputFormat.COMPRESSION,
          StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY,
          StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.VALIDATION,
          StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
<iframe id="aswift_5" style="list-style: none outside none; margin: 0px; padding: 0px; left: 0px; position: absolute; top: 0px;" name="aswift_5" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" width="728" height="90"></iframe>
Example 3
Project: pbase   File: TestInputOutputFormat.java   (6 votes)

private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(this.conf);
    for (Map.Entry<String, String> entry : extraConf.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        writeJob = new Job(conf, "write");
        TextInputFormat.addInputPath(writeJob, inputPath);
        writeJob.setInputFormatClass(TextInputFormat.class);
        writeJob.setNumReduceTasks(0);
        ParquetOutputFormat.setCompression(writeJob, codec);
        ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(readMapperClass);
        ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
        GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema),
                writeJob.getConfiguration());
        writeJob.submit();
        waitForJob(writeJob);
    }
    {
        conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
        readJob = new Job(conf, "read");

        readJob.setInputFormatClass(ParquetInputFormat.class);
        ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
        ParquetInputFormat.setInputPaths(readJob, parquetPath);
        readJob.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(readJob, outputPath);
        readJob.setMapperClass(writeMapperClass);
        readJob.setNumReduceTasks(0);
        readJob.submit();
        waitForJob(readJob);
    }
}
Example 4
Project: pbase   File: CodecConfigTest.java   (6 votes)

public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec)
        throws IOException {
    // Test mapreduce API
    Job job = new Job();
    Configuration conf = job.getConfiguration();
    conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf,
            new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
    Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

    // Test mapred API
    JobConf jobConf = new JobConf();
    jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
<iframe id="aswift_6" style="list-style: none outside none; margin: 0px; padding: 0px; left: 0px; position: absolute; top: 0px;" name="aswift_6" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" width="728" height="90"></iframe>
Example 5
Project: pbase   File: DeprecatedParquetOutputFormat.java   (6 votes)

public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat, FileSystem fs,
                           JobConf conf, String name, Progressable progress) throws IOException {
    CompressionCodecName codec = getCodec(conf);
    String extension = codec.getExtension() + ".parquet";
    Path file = getDefaultWorkFile(conf, name, extension);

    try {
        realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }
}
Example 6
Project: tajo-cdh   File: ParquetAppender.java   (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param path The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, Schema schema, TableMeta meta, Path path)
        throws IOException {
  super(conf, schema, meta, path);
  this.blockSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getOption(ParquetOutputFormat.COMPRESSION));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY));
  this.validating = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.VALIDATION));
}
Example 7
Project: tajo-cdh   File: StorageUtil.java   (6 votes)

public static Options newPhysicalProperties(CatalogProtos.StoreType type) {
  Options options = new Options();
  if (CatalogProtos.StoreType.CSV == type) {
    options.put(CSVFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (CatalogProtos.StoreType.RCFILE == type) {
    options.put(RCFILE_SERDE, DEFAULT_BINARY_SERDE);
  } else if (CatalogProtos.StoreType.SEQUENCEFILE == type) {
    options.put(SEQUENCEFILE_SERDE, DEFAULT_TEXT_SERDE);
    options.put(SEQUENCEFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (type == CatalogProtos.StoreType.PARQUET) {
    options.put(ParquetOutputFormat.BLOCK_SIZE, PARQUET_DEFAULT_BLOCK_SIZE);
    options.put(ParquetOutputFormat.PAGE_SIZE, PARQUET_DEFAULT_PAGE_SIZE);
    options.put(ParquetOutputFormat.COMPRESSION, PARQUET_DEFAULT_COMPRESSION_CODEC_NAME);
    options.put(ParquetOutputFormat.ENABLE_DICTIONARY, PARQUET_DEFAULT_IS_DICTIONARY_ENABLED);
    options.put(ParquetOutputFormat.VALIDATION, PARQUET_DEFAULT_IS_VALIDATION_ENABLED);
  }
  return options;
}
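For reference, the ParquetOutputFormat constants used above are plain configuration key names (for example, BLOCK_SIZE corresponds to "parquet.block.size" in parquet-mr 1.x), so the same tuning can be applied directly to a Hadoop Configuration. A small sketch; the numeric values and codec shown are illustrative defaults, not Tajo's.

import org.apache.hadoop.conf.Configuration;
import parquet.hadoop.ParquetOutputFormat;

public class ParquetConfKeys {
  public static void applyIllustrativeDefaults(Configuration conf) {
    conf.setInt(ParquetOutputFormat.BLOCK_SIZE, 128 * 1024 * 1024);  // row group size in bytes
    conf.setInt(ParquetOutputFormat.PAGE_SIZE, 1024 * 1024);         // page size in bytes
    conf.set(ParquetOutputFormat.COMPRESSION, "SNAPPY");             // codec name
    conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);    // dictionary encoding
    conf.setBoolean(ParquetOutputFormat.VALIDATION, false);          // schema validation on write
  }
}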
Example 8
Project: hadoop-arch-book   File: JavaSessionize.java   (5 votes)

public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: JavaSessionize <master> [input file]");
        System.exit(1);
    }
    System.out.println("Output:" + outputPath);

    JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
            System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));

    JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

    // @formatter:off
    JavaPairRDD<String, SerializableLogLine> parsed = dataSet.map(
            new PairFunction<String, String, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
            return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
    });

    // This groups clicks by IP address
    JavaPairRDD<String, List<SerializableLogLine>> grouped = parsed.groupByKey();

    JavaPairRDD<String, List<SerializableLogLine>> sessionized = grouped.mapValues(
            new Function<List<SerializableLogLine>, List<SerializableLogLine>>() {
        @Override
        public List<SerializableLogLine> call(List<SerializableLogLine> logLines) throws Exception {
            return sessionize(logLines);
        }
    });

    sessionized.foreach(new VoidFunction<Tuple2<String, List<SerializableLogLine>>>() {
        @Override
        public void call(Tuple2<String, List<SerializableLogLine>> stringListTuple2) throws Exception {
            System.out.println("IP: " + stringListTuple2._1());
            for (SerializableLogLine line : stringListTuple2._2()) {
                System.out.println(line);
            }
        }
    });

    // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
    // We want to output an RDD of <String, LogLine>
    // First, grab the Lists, then flatten them,
    // then pair them with something empty to make Hadoop happy
    // @formatter:off
    JavaRDD<List<SerializableLogLine>> nokeys = sessionized.map(
            new Function<Tuple2<String, List<SerializableLogLine>>, List<SerializableLogLine>>() {
        // @formatter:on
        @Override
        public List<SerializableLogLine> call(Tuple2<String, List<SerializableLogLine>> stringListTuple2)
                throws Exception {
            return stringListTuple2._2();
        }
    });

    // @formatter:off
    JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
            new FlatMapFunction<List<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(List<SerializableLogLine> serializableLogLines)
                throws Exception {
            return serializableLogLines;
        }
    });

    JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.map(
            new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
                throws Exception {
            return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
    });

    Job job = new Job();
    ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
    AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

    // dummy instance, because that's the only way to get the class of a parameterized type
    ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

    //System.out.println("job write support - " +
    //        job.getConfiguration().get("parquet.write.support.class") +
    //        " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

    outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
            Void.class,                             // key class
            LogLine.class,                          // value class
            pOutput.getClass(),                     // output format class
            job.getConfiguration());                // configuration
}
Example 9
Project: tajo   File: HiveCatalogStore.java   (5 votes)

@Override
public final void createTable(final CatalogProtos.TableDescProto tableDescProto) {
  HiveCatalogStoreClientPool.HiveCatalogStoreClient client = null;

  TableDesc tableDesc = new TableDesc(tableDescProto);
  String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName());
  String databaseName = splitted[0];
  String tableName = splitted[1];

  try {
    client = clientPool.getClient();

    org.apache.hadoop.hive.metastore.api.Table table = new org.apache.hadoop.hive.metastore.api.Table();
    table.setDbName(databaseName);
    table.setTableName(tableName);
    table.setParameters(new HashMap<>(tableDesc.getMeta().getOptions().getAllKeyValus()));
    // TODO: set owner
    //table.setOwner();

    StorageDescriptor sd = new StorageDescriptor();
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setParameters(new HashMap<>());
    sd.getSerdeInfo().setName(table.getTableName());

    // If tableType is a managed-table, the location is hive-warehouse dir
    // and it will be wrong path in output committing
    table.setTableType(TableType.EXTERNAL_TABLE.name());
    table.putToParameters("EXTERNAL", "TRUE");

    Path tablePath = new Path(tableDesc.getUri());
    FileSystem fs = tablePath.getFileSystem(conf);
    if (fs.isFile(tablePath)) {
      LOG.warn("A table path is a file, but HiveCatalogStore does not allow a file path.");
      sd.setLocation(tablePath.getParent().toString());
    } else {
      sd.setLocation(tablePath.toString());
    }

    // set column information
    List<Column> columns = tableDesc.getSchema().getRootColumns();
    ArrayList<FieldSchema> cols = new ArrayList<>(columns.size());
    for (Column eachField : columns) {
      cols.add(new FieldSchema(eachField.getSimpleName(),
          HiveCatalogUtil.getHiveFieldType(eachField.getDataType()), ""));
    }
    sd.setCols(cols);

    // set partition keys
    if (tableDesc.hasPartition() && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) {
      List<FieldSchema> partitionKeys = new ArrayList<>();
      for (Column eachPartitionKey : tableDesc.getPartitionMethod().getExpressionSchema().getRootColumns()) {
        partitionKeys.add(new FieldSchema(eachPartitionKey.getSimpleName(),
            HiveCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()), ""));
      }
      table.setPartitionKeys(partitionKeys);
    }

    if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.RCFILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.RCFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(ColumnarSerDe.class.getName());
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinaryColumnarSerDe.class.getName());
      }

      if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL)));
      }
    } else if (tableDesc.getMeta().getDataFormat().equals(BuiltinStorages.TEXT)) {
      // TextFileStorageFormatDescriptor has deprecated class. so the class name set directly
      sd.setInputFormat(TextInputFormat.class.getName());
      sd.setOutputFormat(HiveIgnoreKeyTextOutputFormat.class.getName());
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

      String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.TEXT_DELIMITER,
          StorageConstants.DEFAULT_FIELD_DELIMITER);

      // User can use an unicode for filed delimiter such as \u0001, \001.
      // In this case, java console will convert this value into "\\u001".
      // And hive will un-espace this value again.
      // As a result, user can use right field delimiter.
      // So, we have to un-escape this value.
      sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      table.getParameters().remove(StorageConstants.TEXT_DELIMITER);

      if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL)));
        table.getParameters().remove(StorageConstants.TEXT_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.SEQUENCEFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

        String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_DELIMITER,
            StorageConstants.DEFAULT_FIELD_DELIMITER);

        // User can use an unicode for filed delimiter such as \u0001, \001.
        // In this case, java console will convert this value into "\\u001".
        // And hive will un-espace this value again.
        // As a result, user can use right field delimiter.
        // So, we have to un-escape this value.
        sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER);
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinarySerDe.class.getName());
      }

      if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL)));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.PARQUET)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.PARQUET);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());
      sd.getSerdeInfo().setSerializationLib(descriptor.getSerde());

      if (tableDesc.getMeta().containsOption(ParquetOutputFormat.COMPRESSION)) {
        table.putToParameters(ParquetOutputFormat.COMPRESSION,
            tableDesc.getMeta().getOption(ParquetOutputFormat.COMPRESSION));
      }
    } else {
      throw new UnsupportedException(tableDesc.getMeta().getDataFormat() + " in HivecatalogStore");
    }

    sd.setSortCols(new ArrayList<>());

    table.setSd(sd);
    client.getHiveClient().createTable(table);
  } catch (Throwable t) {
    throw new TajoInternalError(t);
  } finally {
    if (client != null) client.release();
  }
}
Example 10
Project: clickstream-tutorial   File: JavaSessionize.java   (5 votes)

public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: JavaSessionize <master> [input file]");
        System.exit(1);
    }

    String outputPath;
    if (args.length == 3) {
        outputPath = args[2];
    } else {
        outputPath = new File(temp, "output").getAbsolutePath();
    }
    System.out.println("Output:" + outputPath);

    JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
            System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));

    JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

    JavaPairRDD<String, SerializableLogLine> parsed = dataSet.mapToPair(
            new PairFunction<String, String, SerializableLogLine>() {
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
            return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
    });

    // This groups clicks by IP address
    JavaPairRDD<String, Iterable<SerializableLogLine>> grouped = parsed.groupByKey();

    JavaPairRDD<String, Iterable<SerializableLogLine>> sessionized = grouped.mapValues(
            new Function<Iterable<SerializableLogLine>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> logLines) throws Exception {
            return sessionize(logLines);
        }
    });

    sessionized.foreach(new VoidFunction<Tuple2<String, Iterable<SerializableLogLine>>>() {
        @Override
        public void call(Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2) throws Exception {
            System.out.println("IP: " + stringListTuple2._1());
            for (SerializableLogLine line : stringListTuple2._2()) {
                System.out.println(line);
            }
        }
    });

    // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
    // We want to output an RDD of <String, LogLine>
    // First, grab the Lists, then flatten them,
    // then pair them with something empty to make Hadoop happy
    JavaRDD<Iterable<SerializableLogLine>> nokeys = sessionized.map(
            new Function<Tuple2<String, Iterable<SerializableLogLine>>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2)
                throws Exception {
            return stringListTuple2._2();
        }
    });

    // @formatter:off
    JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
            new FlatMapFunction<Iterable<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> serializableLogLines)
                throws Exception {
            return serializableLogLines;
        }
    });

    JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.mapToPair(
            new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
                throws Exception {
            return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
    });

    Job job = new Job();
    ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
    AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

    // dummy instance, because that's the only way to get the class of a parameterized type
    ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

    //System.out.println("job write support - " +
    //        job.getConfiguration().get("parquet.write.support.class") +
    //        " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

    outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
            Void.class,                             // key class
            LogLine.class,                          // value class
            pOutput.getClass(),                     // output format class
            job.getConfiguration());                // configuration
}
 
http://www.programcreek.com/java-api-examples/index.php?api=parquet.hadoop.ParquetOutputFormat
 

Reposted from kavy.iteye.com/blog/2296171