Java Code Examples for parquet.hadoop.ParquetOutputFormat

The following are top-voted examples showing how to use parquet.hadoop.ParquetOutputFormat. These examples are extracted from open source projects. You can vote up the examples you like; your votes will be used in our system to produce more good examples.
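Before the individual examples, here is a minimal sketch of how ParquetOutputFormat is typically wired into a new-API MapReduce job. It assumes the Group/GroupWriteSupport example write path and a hypothetical two-field schema; the class name and codec choice are illustrative, and the setter calls mirror those used in Example 3 below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageTypeParser;

public class ParquetJobSetup {

  // Returns a Job configured to write Parquet via the GroupWriteSupport example path.
  public static Job configureParquetWrite(Configuration conf, Path outputPath) throws Exception {
    // Hypothetical schema; replace with a message type matching your records.
    String schema = "message click_log { required binary ip (UTF8); required int64 ts; }";
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(schema), conf);

    Job job = new Job(conf, "write-parquet");
    job.setOutputFormatClass(ParquetOutputFormat.class);
    ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setOutputPath(job, outputPath);
    // Mapper and input configuration are omitted; see Example 3 for a complete write job.
    return job;
  }
}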

 
Example 1
Project: iow-hadoop-streaming   File: ParquetAsTextOutputFormat.java   (7 votes)

private static CompressionCodecName getCodec(JobConf conf) {
    CompressionCodecName codec;

    if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) { // from hadoop config
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + codecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(codecClass);
    } else {
        codec = CompressionCodecName.UNCOMPRESSED;
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
Example 2
Project: tajo   File: ParquetAppender.java   (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema,
                       TableMeta meta, Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getOption(ParquetOutputFormat.COMPRESSION,
          StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY,
          StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.VALIDATION,
          StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
<iframe id="aswift_5" style="list-style: none outside none; margin: 0px; padding: 0px; left: 0px; position: absolute; top: 0px;" name="aswift_5" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" width="728" height="90"></iframe>
Example 3
Project: pbase   File: TestInputOutputFormat.java   (6 votes)

private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(this.conf);
    for (Map.Entry<String, String> entry : extraConf.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        writeJob = new Job(conf, "write");
        TextInputFormat.addInputPath(writeJob, inputPath);
        writeJob.setInputFormatClass(TextInputFormat.class);
        writeJob.setNumReduceTasks(0);
        ParquetOutputFormat.setCompression(writeJob, codec);
        ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(readMapperClass);
        ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
        GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema),
                writeJob.getConfiguration());
        writeJob.submit();
        waitForJob(writeJob);
    }
    {
        conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
        readJob = new Job(conf, "read");

        readJob.setInputFormatClass(ParquetInputFormat.class);
        ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
        ParquetInputFormat.setInputPaths(readJob, parquetPath);
        readJob.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(readJob, outputPath);
        readJob.setMapperClass(writeMapperClass);
        readJob.setNumReduceTasks(0);
        readJob.submit();
        waitForJob(readJob);
    }
}
Example 4
Project: pbase   File: CodecConfigTest.java   (6 votes)

public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec)
        throws IOException {
    // Test mapreduce API
    Job job = new Job();
    Configuration conf = job.getConfiguration();
    conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf,
            new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
    Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

    // Test mapred API
    JobConf jobConf = new JobConf();
    jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
<iframe id="aswift_6" style="list-style: none outside none; margin: 0px; padding: 0px; left: 0px; position: absolute; top: 0px;" name="aswift_6" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" width="728" height="90"></iframe>
Example 5
Project: pbase   File: DeprecatedParquetOutputFormat.java   (6 votes)

public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat, FileSystem fs,
                           JobConf conf, String name, Progressable progress) throws IOException {
    CompressionCodecName codec = getCodec(conf);
    String extension = codec.getExtension() + ".parquet";
    Path file = getDefaultWorkFile(conf, name, extension);

    try {
        realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }
}
Example 6
Project: tajo-cdh   File: ParquetAppender.java   (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param path The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, Schema schema, TableMeta meta, Path path)
        throws IOException {
  super(conf, schema, meta, path);
  this.blockSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getOption(ParquetOutputFormat.COMPRESSION));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY));
  this.validating = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.VALIDATION));
}
Example 7
Project: tajo-cdh   File: StorageUtil.java   (6 votes)

public static Options newPhysicalProperties(CatalogProtos.StoreType type) {
  Options options = new Options();
  if (CatalogProtos.StoreType.CSV == type) {
    options.put(CSVFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (CatalogProtos.StoreType.RCFILE == type) {
    options.put(RCFILE_SERDE, DEFAULT_BINARY_SERDE);
  } else if (CatalogProtos.StoreType.SEQUENCEFILE == type) {
    options.put(SEQUENCEFILE_SERDE, DEFAULT_TEXT_SERDE);
    options.put(SEQUENCEFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (type == CatalogProtos.StoreType.PARQUET) {
    options.put(ParquetOutputFormat.BLOCK_SIZE, PARQUET_DEFAULT_BLOCK_SIZE);
    options.put(ParquetOutputFormat.PAGE_SIZE, PARQUET_DEFAULT_PAGE_SIZE);
    options.put(ParquetOutputFormat.COMPRESSION, PARQUET_DEFAULT_COMPRESSION_CODEC_NAME);
    options.put(ParquetOutputFormat.ENABLE_DICTIONARY, PARQUET_DEFAULT_IS_DICTIONARY_ENABLED);
    options.put(ParquetOutputFormat.VALIDATION, PARQUET_DEFAULT_IS_VALIDATION_ENABLED);
  }
  return options;
}
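For reference, the ParquetOutputFormat constants used above are plain configuration key names (for example, BLOCK_SIZE corresponds to "parquet.block.size" in parquet-mr 1.x), so the same tuning can be applied directly to a Hadoop Configuration. A small sketch; the numeric values and codec shown are illustrative defaults, not Tajo's.

import org.apache.hadoop.conf.Configuration;
import parquet.hadoop.ParquetOutputFormat;

public class ParquetConfKeys {
  public static void applyIllustrativeDefaults(Configuration conf) {
    conf.setInt(ParquetOutputFormat.BLOCK_SIZE, 128 * 1024 * 1024);  // row group size in bytes
    conf.setInt(ParquetOutputFormat.PAGE_SIZE, 1024 * 1024);         // page size in bytes
    conf.set(ParquetOutputFormat.COMPRESSION, "SNAPPY");             // codec name
    conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);    // dictionary encoding
    conf.setBoolean(ParquetOutputFormat.VALIDATION, false);          // schema validation on write
  }
}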
Example 8
Project: hadoop-arch-book   File: JavaSessionize.java   (5 votes)

public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: JavaSessionize <master> [input file]");
        System.exit(1);
    }
    System.out.println("Output:" + outputPath);

    JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
            System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));

    JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

    // @formatter:off
    JavaPairRDD<String, SerializableLogLine> parsed = dataSet.map(
            new PairFunction<String, String, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
            return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
    });

    // This groups clicks by IP address
    JavaPairRDD<String, List<SerializableLogLine>> grouped = parsed.groupByKey();

    JavaPairRDD<String, List<SerializableLogLine>> sessionized = grouped.mapValues(
            new Function<List<SerializableLogLine>, List<SerializableLogLine>>() {
        @Override
        public List<SerializableLogLine> call(List<SerializableLogLine> logLines) throws Exception {
            return sessionize(logLines);
        }
    });

    sessionized.foreach(new VoidFunction<Tuple2<String, List<SerializableLogLine>>>() {
        @Override
        public void call(Tuple2<String, List<SerializableLogLine>> stringListTuple2) throws Exception {
            System.out.println("IP: " + stringListTuple2._1());
            for (SerializableLogLine line : stringListTuple2._2()) {
                System.out.println(line);
            }
        }
    });

    // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
    // We want to output an RDD of <String, LogLine>
    // First, grab the Lists, then flatten them,
    // then pair them with something empty to make Hadoop happy
    // @formatter:off
    JavaRDD<List<SerializableLogLine>> nokeys = sessionized.map(
            new Function<Tuple2<String, List<SerializableLogLine>>, List<SerializableLogLine>>() {
        // @formatter:on
        @Override
        public List<SerializableLogLine> call(Tuple2<String, List<SerializableLogLine>> stringListTuple2)
                throws Exception {
            return stringListTuple2._2();
        }
    });

    // @formatter:off
    JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
            new FlatMapFunction<List<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(List<SerializableLogLine> serializableLogLines)
                throws Exception {
            return serializableLogLines;
        }
    });

    JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.map(
            new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
                throws Exception {
            return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
    });

    Job job = new Job();
    ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
    AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

    // dummy instance, because that's the only way to get the class of a parameterized type
    ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

    //System.out.println("job write support - " +
    //        job.getConfiguration().get("parquet.write.support.class") +
    //        " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

    outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
            Void.class,                             // key class
            LogLine.class,                          // value class
            pOutput.getClass(),                     // output format class
            job.getConfiguration());                // configuration
}
Example 9
Project: tajo   File: HiveCatalogStore.java   (5 votes)

@Override
public final void createTable(final CatalogProtos.TableDescProto tableDescProto) {
  HiveCatalogStoreClientPool.HiveCatalogStoreClient client = null;

  TableDesc tableDesc = new TableDesc(tableDescProto);
  String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName());
  String databaseName = splitted[0];
  String tableName = splitted[1];

  try {
    client = clientPool.getClient();

    org.apache.hadoop.hive.metastore.api.Table table = new org.apache.hadoop.hive.metastore.api.Table();
    table.setDbName(databaseName);
    table.setTableName(tableName);
    table.setParameters(new HashMap<>(tableDesc.getMeta().getOptions().getAllKeyValus()));
    // TODO: set owner
    //table.setOwner();

    StorageDescriptor sd = new StorageDescriptor();
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setParameters(new HashMap<>());
    sd.getSerdeInfo().setName(table.getTableName());

    // If tableType is a managed-table, the location is hive-warehouse dir
    // and it will be wrong path in output committing
    table.setTableType(TableType.EXTERNAL_TABLE.name());
    table.putToParameters("EXTERNAL", "TRUE");

    Path tablePath = new Path(tableDesc.getUri());
    FileSystem fs = tablePath.getFileSystem(conf);
    if (fs.isFile(tablePath)) {
      LOG.warn("A table path is a file, but HiveCatalogStore does not allow a file path.");
      sd.setLocation(tablePath.getParent().toString());
    } else {
      sd.setLocation(tablePath.toString());
    }

    // set column information
    List<Column> columns = tableDesc.getSchema().getRootColumns();
    ArrayList<FieldSchema> cols = new ArrayList<>(columns.size());
    for (Column eachField : columns) {
      cols.add(new FieldSchema(eachField.getSimpleName(),
          HiveCatalogUtil.getHiveFieldType(eachField.getDataType()), ""));
    }
    sd.setCols(cols);

    // set partition keys
    if (tableDesc.hasPartition() && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) {
      List<FieldSchema> partitionKeys = new ArrayList<>();
      for (Column eachPartitionKey : tableDesc.getPartitionMethod().getExpressionSchema().getRootColumns()) {
        partitionKeys.add(new FieldSchema(eachPartitionKey.getSimpleName(),
            HiveCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()), ""));
      }
      table.setPartitionKeys(partitionKeys);
    }

    if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.RCFILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.RCFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(ColumnarSerDe.class.getName());
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinaryColumnarSerDe.class.getName());
      }

      if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL)));
      }
    } else if (tableDesc.getMeta().getDataFormat().equals(BuiltinStorages.TEXT)) {
      // TextFileStorageFormatDescriptor has deprecated class. so the class name set directly
      sd.setInputFormat(TextInputFormat.class.getName());
      sd.setOutputFormat(HiveIgnoreKeyTextOutputFormat.class.getName());
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

      String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.TEXT_DELIMITER,
          StorageConstants.DEFAULT_FIELD_DELIMITER);

      // User can use an unicode for filed delimiter such as \u0001, \001.
      // In this case, java console will convert this value into "\\u001".
      // And hive will un-espace this value again.
      // As a result, user can use right field delimiter.
      // So, we have to un-escape this value.
      sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      table.getParameters().remove(StorageConstants.TEXT_DELIMITER);

      if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL)));
        table.getParameters().remove(StorageConstants.TEXT_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.SEQUENCEFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

        String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_DELIMITER,
            StorageConstants.DEFAULT_FIELD_DELIMITER);

        // User can use an unicode for filed delimiter such as \u0001, \001.
        // In this case, java console will convert this value into "\\u001".
        // And hive will un-espace this value again.
        // As a result, user can use right field delimiter.
        // So, we have to un-escape this value.
        sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER);
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinarySerDe.class.getName());
      }

      if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL)));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.PARQUET)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.PARQUET);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());
      sd.getSerdeInfo().setSerializationLib(descriptor.getSerde());

      if (tableDesc.getMeta().containsOption(ParquetOutputFormat.COMPRESSION)) {
        table.putToParameters(ParquetOutputFormat.COMPRESSION,
            tableDesc.getMeta().getOption(ParquetOutputFormat.COMPRESSION));
      }
    } else {
      throw new UnsupportedException(tableDesc.getMeta().getDataFormat() + " in HivecatalogStore");
    }

    sd.setSortCols(new ArrayList<>());

    table.setSd(sd);
    client.getHiveClient().createTable(table);
  } catch (Throwable t) {
    throw new TajoInternalError(t);
  } finally {
    if (client != null) client.release();
  }
}
Example 10
Project: clickstream-tutorial   File: JavaSessionize.java   (5 votes)

public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: JavaSessionize <master> [input file]");
        System.exit(1);
    }

    String outputPath;
    if (args.length == 3) {
        outputPath = args[2];
    } else {
        outputPath = new File(temp, "output").getAbsolutePath();
    }
    System.out.println("Output:" + outputPath);

    JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
            System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));

    JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

    JavaPairRDD<String, SerializableLogLine> parsed = dataSet.mapToPair(
            new PairFunction<String, String, SerializableLogLine>() {
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
            return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
    });

    // This groups clicks by IP address
    JavaPairRDD<String, Iterable<SerializableLogLine>> grouped = parsed.groupByKey();

    JavaPairRDD<String, Iterable<SerializableLogLine>> sessionized = grouped.mapValues(
            new Function<Iterable<SerializableLogLine>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> logLines) throws Exception {
            return sessionize(logLines);
        }
    });

    sessionized.foreach(new VoidFunction<Tuple2<String, Iterable<SerializableLogLine>>>() {
        @Override
        public void call(Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2) throws Exception {
            System.out.println("IP: " + stringListTuple2._1());
            for (SerializableLogLine line : stringListTuple2._2()) {
                System.out.println(line);
            }
        }
    });

    // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
    // We want to output an RDD of <String, LogLine>
    // First, grab the Lists, then flatten them,
    // then pair them with something empty to make Hadoop happy
    JavaRDD<Iterable<SerializableLogLine>> nokeys = sessionized.map(
            new Function<Tuple2<String, Iterable<SerializableLogLine>>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2)
                throws Exception {
            return stringListTuple2._2();
        }
    });

    // @formatter:off
    JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
            new FlatMapFunction<Iterable<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> serializableLogLines)
                throws Exception {
            return serializableLogLines;
        }
    });

    JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.mapToPair(
            new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
                throws Exception {
            return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
    });

    Job job = new Job();
    ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
    AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

    // dummy instance, because that's the only way to get the class of a parameterized type
    ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

    //System.out.println("job write support - " +
    //        job.getConfiguration().get("parquet.write.support.class") +
    //        " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

    outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
            Void.class,                             // key class
            LogLine.class,                          // value class
            pOutput.getClass(),                     // output format class
            job.getConfiguration());                // configuration
}
 
http://www.programcreek.com/java-api-examples/index.php?api=parquet.hadoop.ParquetOutputFormat
 

Reposted from kavy.iteye.com/blog/2296171