上文介绍了protobuffs的词法与语法分析过程,发现其收集了目标proto文件及其import文件的语法树,此处继续观察语义分析过程。
语法树的格式如下:
[Message1 = {message,MessageName,[FieldRecord1,FieldRecord2,...,FieldRecordn]},Message2,...,Messagen],
FieldRecord = {FieldID,required/optional/repeated/repeated_packed,FieldType,FieldName,DefaultValue}。
protobuffs_compile.erl
%% Drive the pipeline for one .proto file: parse it, pull in its imports,
%% collect message/enum definitions, resolve field types, and pass the
%% result to output_source/4.
%%
%% ProtoFile - path of the .proto file, as a string.
%% Options   - proplist; `imports_dir` may supply extra import search dirs.
%%
%% Crashes with badmatch if parse/1 does not return {ok, _}.
generate_source(ProtoFile, Options) when is_list(ProtoFile) ->
    %% Output base name is the proto base name with "_pb" appended.
    OutputName = filename:basename(ProtoFile, ".proto") ++ "_pb",
    {ok, RootTree} = parse(ProtoFile),
    %% "./" and "src/" are listed ahead of any user-supplied directories.
    SearchDirs = ["./", "src/" | proplists:get_value(imports_dir, Options, [])],
    FullTree = parse_imports(RootTree, SearchDirs),
    Symbols = collect_full_messages(FullTree),
    EnumDefs = Symbols#collected.enum,
    ResolvedMsgs = resolve_types(Symbols#collected.msg, EnumDefs),
    output_source(OutputName, ResolvedMsgs, EnumDefs, Options).
语义分析需要遍历语法树中所有的message、enum、extend定义:收集所有message的域定义和所有enum定义,并根据extend定义对相应的message进行扩展。
%% Walk the parsed syntax tree and accumulate its definitions into a
%% #collected{} record, starting from an empty one.
collect_full_messages(Data) ->
    collect_full_messages(Data, #collected{}).

%% `message` clause: split the message body into plain fields, nested enums,
%% extension ranges and nested messages. Fields and extension ranges are
%% stored in Collected under the message's type path; nested enums and
%% messages are re-queued behind Tail so later iterations handle them.
collect_full_messages([{message, Name, Fields} | Tail], Collected) ->
    %% A name whose head is not a list is a single string and gets wrapped
    %% into a one-element path; otherwise it is already a path (innermost
    %% name first, as built for nested definitions below).
    ListName =
        case is_list(hd(Name)) of
            false -> [Name];
            true -> Name
        end,
    %% Every selection below is kept in reverse order of appearance.
    FieldsOut = lists:reverse([F || {_, _, _, _, _} = F <- Fields]),
    Enums =
        lists:reverse([{enum, [EnumName | ListName], Values}
                       || {enum, EnumName, Values} <- Fields]),
    Extensions =
        lists:reverse([{From, To} || {extensions, From, To} <- Fields]),
    SubMessages =
        lists:reverse([{message, [SubName | ListName], SubFields}
                       || {message, SubName, SubFields} <- Fields]),
    KnownMsgs = Collected#collected.msg,
    KnownExts = Collected#collected.extensions,
    NewCollected = Collected#collected{
        msg = [{ListName, FieldsOut} | KnownMsgs],
        extensions = [{ListName, Extensions} | KnownExts]
    },
    collect_full_messages(Tail ++ SubMessages ++ Enums, NewCollected);
...
仍旧分析主角message的处理过程:
可以看到对message的处理主要包括取出一个message中的所有域信息、枚举定义、扩展定义、嵌套message,并将枚举定义和嵌套message合并到上级语法树中继续处理,而将域信息和扩展定义收集到一个collected结构中。
%% `enum` clause: turn every {EnumAtom, IntValue} pair of the enum body into
%% an {enum, TypeName, IntValue, EnumAtom} tuple and put them in front of
%% the enums collected so far. Body entries of any other shape are ignored.
collect_full_messages([{enum, Name, Fields} | Tail], Collected) ->
    %% Wrap a single string name into a one-element type path.
    ListName =
        case is_list(hd(Name)) of
            false -> [Name];
            true -> Name
        end,
    %% Kept in reverse order of appearance.
    EnumRecords =
        lists:reverse([{enum, type_path_to_type(ListName), IntValue, EnumAtom}
                       || {EnumAtom, IntValue} <- Fields]),
    KnownEnums = Collected#collected.enum,
    Updated = Collected#collected{enum = EnumRecords ++ KnownEnums},
    collect_full_messages(Tail, Updated);
%% Convert a type path (list of name strings, innermost first) into a flat
%% type name: the path is reversed and its parts are joined with "_".
type_path_to_type(TypePath) ->
    OutermostFirst = lists:reverse(TypePath),
    string:join(OutermostFirst, "_").
...
对enum的处理过程:
enum的处理比较简单,仅仅是提取enum定义中的每一个枚举值,并为每个枚举值生成一条记录{enum,EnumName,EnumValue,EnumAtom},将这些记录继续收集到collected结构中。
%% `extend` clause: append the extension fields to the already-collected
%% field list of the extended message.
%%
%% Each extension field id must fall inside one of the message's declared
%% extension ranges and outside 19000..19999; otherwise an error report is
%% logged and `out_of_range` is thrown.
%% Crashes with badmatch if the extended message is not in Collected yet.
collect_full_messages([{extend, Name, ExtendedFields} | Tail], Collected) ->
    %% Wrap a single string name into a one-element type path.
    ListName =
        case is_list(hd(Name)) of
            false -> [Name];
            true -> Name
        end,
    KnownMsgs = Collected#collected.msg,
    {ListName, BaseFields} = lists:keyfind(ListName, 1, KnownMsgs),
    {ListName, Ranges} = lists:keyfind(ListName, 1, Collected#collected.extensions),
    IsReserved = fun(Id) -> 19000 =< Id andalso Id =< 19999 end,
    %% A range declared as "to max" is capped at 16#1fffffff.
    UpperBound = fun(max) -> 16#1fffffff; (To) -> To end,
    IsExtensionId =
        fun(Id) ->
            lists:any(
                fun({From, To}) ->
                    not IsReserved(Id) andalso
                        From =< Id andalso Id =< UpperBound(To)
                end, Ranges)
        end,
    %% Returns the field unchanged when its id is acceptable, throws otherwise.
    CheckField =
        fun({Id, _, _, FieldName, _} = Field) ->
            case IsExtensionId(Id) of
                true ->
                    Field;
                false ->
                    error_logger:error_report(["Extended field not in valid range",
                                               {message, Name},
                                               {field_id, Id},
                                               {field_name, FieldName},
                                               {defined_ranges, Ranges},
                                               {reserved_range, {19000, 19999}},
                                               {max, 16#1fffffff}]),
                    throw(out_of_range)
            end
        end,
    %% Only 5-tuples are fields; accepted ones are kept in reverse order of
    %% appearance and placed after the message's existing fields.
    Accepted = lists:reverse([CheckField(F) || {_, _, _, _, _} = F <- ExtendedFields]),
    Merged = BaseFields ++ Accepted,
    NewMsgs = lists:keyreplace(ListName, 1, KnownMsgs, {ListName, Merged}),
    collect_full_messages(Tail, Collected#collected{msg = NewMsgs});
对extend的处理过程:
extend本质上体现的是一种依赖关系。此处首先检查extend所依赖的message是否存在(若不存在,lists:keyfind返回false,模式匹配失败),然后检查extend中每个域的编号是否落在该message的extensions定义范围内,且不在保留区间19000~19999之内,最后将extend的域追加到原始message的域定义中,并替换collected中该message的定义。
%% Drive the pipeline for one .proto file: parse it, pull in its imports,
%% collect message/enum definitions, resolve field types, and pass the
%% result to output_source/4.
%%
%% ProtoFile - path of the .proto file, as a string.
%% Options   - proplist; `imports_dir` may supply extra import search dirs.
%%
%% Crashes with badmatch if parse/1 does not return {ok, _}.
generate_source(ProtoFile, Options) when is_list(ProtoFile) ->
    %% Output base name is the proto base name with "_pb" appended.
    OutputName = filename:basename(ProtoFile, ".proto") ++ "_pb",
    {ok, RootTree} = parse(ProtoFile),
    %% "./" and "src/" are listed ahead of any user-supplied directories.
    SearchDirs = ["./", "src/" | proplists:get_value(imports_dir, Options, [])],
    FullTree = parse_imports(RootTree, SearchDirs),
    Symbols = collect_full_messages(FullTree),
    EnumDefs = Symbols#collected.enum,
    ResolvedMsgs = resolve_types(Symbols#collected.msg, EnumDefs),
    output_source(OutputName, ResolvedMsgs, EnumDefs, Options).
收集完所有的message定义和enum定义之后,接下来对所有message的域进行类型解析。
%% Resolve the type of every message field.
%%
%% Data  - list of {TypePath, Fields} entries, as collected per message.
%% Enums - list of collected enum tuples.
%%
%% Returns a list of {TypeName, Fields} with TypeName flattened through
%% type_path_to_type/1. Entries are accumulated by prepending, so they come
%% out in reverse order of Data.
%% Throws ["Unknown Type ", Type] when a non-scalar type resolves to neither
%% a known message nor an enum.
resolve_types(Data, Enums) ->
    resolve_types(Data, Data, Enums, []).

resolve_types([{TypePath, Fields} | Tail], AllPaths, Enums, Acc) ->
    %% Candidate type paths for a non-scalar type name.
    CandidatePaths =
        fun(Type) ->
            case string:tokens(Type, ".") of
                [Type] ->
                    %% Undotted name: try the paths derived from this message.
                    all_possible_type_paths(Type, TypePath);
                FullPath ->
                    %% Dotted names such as Foo.Bar are treated as absolute:
                    %% convert to a type path and check only that one.
                    [lists:reverse(FullPath)]
            end
        end,
    %% Look among the messages first, then among the enums.
    Lookup =
        fun(Type, Candidates) ->
            case find_type(Candidates, AllPaths) of
                false ->
                    case is_enum_type(Type, Candidates, Enums) of
                        {true, EnumType} -> EnumType;
                        false -> throw(["Unknown Type ", Type])
                    end;
                Found ->
                    Found
            end
        end,
    %% Scalar fields pass through untouched; others get their resolved name.
    ResolveField =
        fun({Index, Rules, Type, Identifier, Other} = Field) ->
            case is_scalar_type(Type) of
                true ->
                    Field;
                false ->
                    RealPath = Lookup(Type, CandidatePaths(Type)),
                    {Index, Rules, type_path_to_type(RealPath), Identifier, Other}
            end
        end,
    %% Only 5-tuples are fields; their relative order is preserved.
    Resolved = [ResolveField(Field) || {_, _, _, _, _} = Field <- Fields],
    resolve_types(Tail, AllPaths, Enums,
                  [{type_path_to_type(TypePath), Resolved} | Acc]);
resolve_types([], _, _, Acc) ->
    Acc.
由于protocol buffers实质是一种对类型的描述,此处类型解析将是protobuffs的核心语义分析过程,对message的域进行类型检查和类型绑定,主要根据以下原则进行:
对于标量类型,也即protocol buffers的内建类型,无需过多检查;对于枚举类型和自定义类型,需要检查所依赖的类型,将其绑定到具体的枚举类型或自定义类型上。
至此,protobuffs的语义分析过程就结束了,这个过程比较简单,仅仅是类型收集、类型检查和类型绑定,此处已经收集到了类型的符号表:
[
Message1 = {MessageName,
[
Field1 = {FieldId,
FieldRule(required/optional/repeated/repeated_packed...),
FieldType(ScalarType, EnumType,OtherType),
FieldName,Other(DefaultValue)},
Field2,...,Fieldn]},
Message2,...Messagen,
Enum1 = {enum, EnumName, EnumValue, EnumAtom},
Enum2,...,Enumn
]。
未完待续...