Descrição do Problema
Na versão 0.12.0 do hudi, tanto o flink quanto o spark podem realizar o gerenciamento de metadados com base no metastore do hive. Para obter mais informações, consulte: hudi HMS Catalog Guide . Ou seja, com base no catálogo Hudi HMS, depois que Flink cria a tabela, Flink ou Spark podem escrever, ou depois que Spark cria a tabela, Spark ou Flink podem escrever. No entanto, atualmente há um problema na versão Hudi 0.12.0. Depois de usar o catálogo flink HMS para criar uma tabela Hudi, o Spark SQL combinado com o catálogo Spark HMS para importar em lote os dados do hive não podem ser importados. O método de reprodução específico e a versão são os mesmos segue:
grave 0.12.0
grande 1.13.6
faísca 3.3.0
colmeia (HDP versão 3.1.4)
criação de tabela sql flink
create catalog hudi with(
'type' = 'hudi',
'mode' = 'hms',
'hive.conf.dir'='/etc/hive/conf'
);
--- 创建数据库供hudi使用
create database hudi.hudidb;
CREATE TABLE IF NOT EXISTS hudi.hudidb.store_returns_hudi_4(
sr_returned_date_sk BIGINT NOT NULL,
sr_return_time_sk BIGINT,
sr_item_sk BIGINT,
sr_customer_sk BIGINT,
sr_cdemo_sk BIGINT,
sr_hdemo_sk BIGINT,
sr_addr_sk BIGINT,
sr_store_sk BIGINT,
sr_reason_sk BIGINT,
sr_ticket_number BIGINT,
sr_return_quantity INT,
sr_return_amt DOUBLE,
sr_return_tax DOUBLE,
sr_return_amt_inc_tax DOUBLE,
sr_fee DOUBLE,
sr_return_ship_cost DOUBLE,
sr_refunded_cash DOUBLE,
sr_reversed_charge DOUBLE,
sr_store_credit DOUBLE,
sr_net_loss DOUBLE,
insert_time STRING,
PRIMARY KEY(sr_returned_date_sk) NOT ENFORCED
)
with (
'connector' = 'hudi',
'table.type' = 'MERGE_ON_READ',
'hive_sync.conf.dir' = '/etc/hive/conf',
'write.precombine.field' = 'sr_ticket_number',
'hoodie.datasource.write.hive_style_partitioning'='false',
'index.bootstrap.enabled' = 'true'
);
Crie tabelas e importe dados no hive:
CREATE TABLE store_returns (
sr_returned_date_sk BIGINT,
sr_return_time_sk BIGINT,
sr_item_sk BIGINT,
sr_customer_sk BIGINT,
sr_cdemo_sk BIGINT,
sr_hdemo_sk BIGINT,
sr_addr_sk BIGINT,
sr_store_sk BIGINT,
sr_reason_sk BIGINT,
sr_ticket_number BIGINT,
sr_return_quantity INT,
sr_return_amt DOUBLE,
sr_return_tax DOUBLE,
sr_return_amt_inc_tax DOUBLE,
sr_fee DOUBLE,
sr_return_ship_cost DOUBLE,
sr_refunded_cash DOUBLE,
sr_reversed_charge DOUBLE,
sr_store_credit DOUBLE,
sr_net_loss DOUBLE)
TBLPROPERTIES (
'bucketing_version' = '2',
'transient_lastDdlTime' = '1658370656');
insert into store_returns select 1,29496,26309,41563,325271,1894,196917,86,6,240001,43,79.55,4.77,84.32,63.0,212.42,62.84,0.83,15.88,280.19;
Importar dados usando spark sql
insert into hudidb.store_returns_hudi_4(
sr_returned_date_sk,
sr_return_time_sk,
sr_item_sk,
sr_customer_sk,
sr_cdemo_sk,
sr_hdemo_sk,
sr_addr_sk,
sr_store_sk,
sr_reason_sk,
sr_ticket_number,
sr_return_quantity,
sr_return_amt,
sr_return_tax,
sr_return_amt_inc_tax,
sr_fee,
sr_return_ship_cost,
sr_refunded_cash,
sr_reversed_charge,
sr_store_credit,
sr_net_loss,
insert_time
)
select sr_returned_date_sk,
sr_return_time_sk,
sr_item_sk,
sr_customer_sk,
sr_cdemo_sk,
sr_hdemo_sk,
sr_addr_sk,
sr_store_sk,
sr_reason_sk,
sr_ticket_number,
sr_return_quantity,
sr_return_amt,
sr_return_tax,
sr_return_amt_inc_tax,
sr_fee,
sr_return_ship_cost,
sr_refunded_cash,
sr_reversed_charge,
sr_store_credit,
sr_net_loss,cast(current_timestamp() as string);
O erro específico relatado é o seguinte:
Error: org.apache.hive.service.cli.HiveSQLException: Error running query: org.apache.spark.sql.AnalysisException: Cannot write incompatible data to table 'hudidb.store_returns_hudi_4':
- Cannot write nullable values to non-null column 'sr_returned_date_sk'
at org.apache.spark.sql.hive.thriftserver.HiveThriftServerErrors$.runningQueryError(HiveThriftServerErrors.scala:43)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation.org$apache$spark$sql$hive$thriftserver$SparkExecuteStatementOperation$$execute(SparkExecuteStatementOperation.scala:325)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation$$anon$2$$anon$3.$anonfun$run$2(SparkExecuteStatementOperation.scala:230)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.hive.thriftserver.SparkOperation.withLocalProperties(SparkOperation.scala:79)
at org.apache.spark.sql.hive.thriftserver.SparkOperation.withLocalProperties$(SparkOperation.scala:63)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation.withLocalProperties(SparkExecuteStatementOperation.scala:43)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation$$anon$2$$anon$3.run(SparkExecuteStatementOperation.scala:230)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation$$anon$2$$anon$3.run(SparkExecuteStatementOperation.scala:225)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation$$anon$2.run(SparkExecuteStatementOperation.scala:239)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.AnalysisException: Cannot write incompatible data to table 'hudidb.store_returns_hudi_4':
- Cannot write nullable values to non-null column 'sr_returned_date_sk'
at org.apache.spark.sql.errors.QueryCompilationErrors$.cannotWriteIncompatibleDataToTableError(QueryCompilationErrors.scala:1535)
at org.apache.spark.sql.catalyst.analysis.TableOutputResolver$.resolveOutputColumns(TableOutputResolver.scala:59)
at org.apache.spark.sql.HoodieSpark3CatalystPlanUtils.resolveOutputColumns(HoodieSpark3CatalystPlanUtils.scala:38)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.coerceQueryOutputColumns(InsertIntoHoodieTableCommand.scala:159)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.alignQueryOutput(InsertIntoHoodieTableCommand.scala:142)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.run(InsertIntoHoodieTableCommand.scala:99)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand.run(InsertIntoHoodieTableCommand.scala:60)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:622)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:617)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:651)
at org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation.org$apache$spark$sql$hive$thriftserver$SparkExecuteStatementOperation$$execute(SparkExecuteStatementOperation.scala:291)
... 16 more (state=,code=0)
analise de problemas
Analisando o código e verificando os atributos da tabela, descobrimos que o atributo anulável do campo sr_returned_date_sk no valor correspondente à configuração spark.sql.sources.schema.part.0 no metastore hive correspondente à tabela criada pelo flink é falso.Se a tabela acima for construída através do Spark, este atributo de campo é verdadeiro.
Pode-se julgar que quando o flink criou a tabela hudi no metastore do hive, houve um problema com os parâmetros construídos para o spark, ou seja, o correspondente
HoodieHiveCatalog.instantiateHiveTable em
serdeProperties.putAll(TableOptionProperties.translateFlinkTableProperties2Spark(catalogTable, hiveConf, propriedades, partiçãoKeys));
O método translateFlinkTableProperties2Spark é o seguinte
public static Map<String, String> translateFlinkTableProperties2Spark(
CatalogTable catalogTable,
Configuration hadoopConf,
Map<String, String> properties,
List<String> partitionKeys) {
Schema schema = AvroSchemaConverter.convertToSchema(catalogTable.getSchema().toPhysicalRowDataType().getLogicalType());
MessageType messageType = TableSchemaResolver.convertAvroSchemaToParquet(schema, hadoopConf);
String sparkVersion = catalogTable.getOptions().getOrDefault(SPARK_VERSION, DEFAULT_SPARK_VERSION);
Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(
partitionKeys,
sparkVersion,
4000,
messageType);
properties.putAll(sparkTableProperties);
return properties.entrySet().stream()
.filter(e -> KEY_MAPPING.containsKey(e.getKey()) && !catalogTable.getOptions().containsKey(KEY_MAPPING.get(e.getKey())))
.collect(Collectors.toMap(e -> KEY_MAPPING.get(e.getKey()),
e -> e.getKey().equalsIgnoreCase(FlinkOptions.TABLE_TYPE.key()) ? VALUE_MAPPING.get(e.getValue()) : e.getValue()));
}
Portanto, podemos considerar alterar o atributo anulável do campo no valor correspondente a spark.sql.sources.schema.part.0 para true, ou seja, modificar o método acima da seguinte forma:
public static Map<String, String> translateFlinkTableProperties2Spark(
CatalogTable catalogTable,
Configuration hadoopConf,
Map<String, String> properties,
List<String> partitionKeys) {
Schema schema = AvroSchemaConverter.convertToSchema(catalogTable.getSchema().toPhysicalRowDataType().getLogicalType());
MessageType messageType = TableSchemaResolver.convertAvroSchemaToParquet(schema, hadoopConf);
String sparkVersion = catalogTable.getOptions().getOrDefault(SPARK_VERSION, DEFAULT_SPARK_VERSION);
Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(
partitionKeys,
sparkVersion,
4000,
messageType);
properties.putAll(replaceSparkTableProperties(sparkTableProperties));
return properties.entrySet().stream()
.filter(e -> KEY_MAPPING.containsKey(e.getKey()) && !catalogTable.getOptions().containsKey(KEY_MAPPING.get(e.getKey())))
.collect(Collectors.toMap(e -> KEY_MAPPING.get(e.getKey()),
e -> e.getKey().equalsIgnoreCase(FlinkOptions.TABLE_TYPE.key()) ? VALUE_MAPPING.get(e.getValue()) : e.getValue()));
}
public static Map<String, String> replaceSparkTableProperties(Map<String, String> originProperties) {
for (String key : originProperties.keySet()) {
if (key.contains("spark.sql.sources.schema.part.")) {
originProperties.put(key, originProperties.get(key)
.replaceAll(" ", "").replaceAll("\"nullable\":false", "\"nullable\":true"));
}
}
return originProperties;
}