dexファイルの説明と分析

かなり前に調べた内容を、記録として残しておきます。

1. dexファイルを010 Editorにドラッグ＆ドロップして開きます

ここに画像の説明を挿入
このdexファイルは、次のような領域に分かれていることがわかります。
dex_header：dexのヘッダー。主に各領域のオフセット（変位）とサイズ、dexの検証情報などを保持します。
dex_string_ids：dexの文字列領域
dex_type_ids：dexの型（タイプ）領域
dex_proto_ids：何のために使われるのか、筆者にはよくわかっていません
dex_field_ids：dexクラスのフィールド領域
dex_method_ids：dexクラスの関数（メソッド）領域
dex_class_defs：dexクラスの定義領域
010 Editor上で、各領域のオフセットとサイズを確認できます。
では、dexパーサーはどのように書けばよいでしょうか。

2.最初にdexヘッダーを解析し、ファイルがdexファイルであるかどうかを確認します

def is_dex(f):
    """Check the DEX magic at the current position of *f* and return the version.

    Reads 8 bytes in total: a 4-byte magic (the bytes ``dex`` followed by a
    newline) and a 4-byte version field whose first three bytes are returned
    as text. The magic is echoed to stdout before it is checked.

    Args:
        f: Binary file-like object positioned at the start of the header.

    Returns:
        The three-character version string, e.g. ``"035"``.

    Raises:
        ValueError: If the magic does not match, or the version field is
            truncated or not ASCII.
    """
    magic = f.read(4)
    # Compare raw bytes. Decoding arbitrary binary data as UTF-8 first would
    # fail with a codec error instead of the intended message below.
    print(magic.decode("ascii", errors="replace"))
    if magic != b'dex\n':
        raise ValueError("not a valid dex file")
    version_field = f.read(4)
    if len(version_field) < 4:
        # A short read means the file ends inside the magic/version area.
        raise ValueError("not a valid dex file")
    try:
        return version_field[:3].decode("ascii")
    except UnicodeDecodeError:
        raise ValueError("not a valid dex file") from None


def parse_dex_header(f):
    """Parse the DEX header from *f* and return its size/offset table.

    The magic and version are validated through ``is_dex``. The checksum,
    signature, file size, header size and endian tag are read and printed
    but not returned. The remaining seventeen little-endian uint32 fields
    are collected into a dict keyed by field name, which is printed and
    returned.

    Args:
        f: Binary file-like object positioned at the start of the header.

    Returns:
        dict mapping names such as ``'string_ids_size'`` / ``'string_ids_off'``
        to their integer values.
    """
    def read_u32():
        # One little-endian unsigned 32-bit integer from the current position.
        return struct.unpack("<I", f.read(4))[0]

    version = is_dex(f)
    print("dex version", version)

    print("check sum", hex(read_u32()))
    print("signature", f.read(20).hex())
    print("file size", read_u32())
    print("header size", read_u32())
    # An endian tag of 0x12345678 indicates little-endian.
    print("end tag", hex(read_u32()))

    field_names = (
        'link_size', 'link_off', 'map_off',
        'string_ids_size', 'string_ids_off',
        'type_ids_size', 'type_ids_off',
        'proto_ids_size', 'proto_ids_off',
        'field_ids_size', 'field_ids_off',
        'method_ids_size', 'method_ids_off',
        'class_defs_size', 'class_defs_off',
        'data_size', 'data_off',
    )
    # The fields are stored back to back, so read them in declaration order.
    table = {field: read_u32() for field in field_names}
    print(table)
    return table

3. コード全体

#!/usr/bin/env python
# @Time  : 2019-12-26 20:03

import struct


def read_uleb128(f):
    """Decode one unsigned LEB128 integer from the current position of *f*.

    Each byte contributes its low 7 bits, least-significant group first;
    a set high bit (0x80) means another byte follows.

    Args:
        f: Binary file-like object.

    Returns:
        The decoded non-negative integer.
    """
    result = 0
    shift = 0
    more = True
    while more:
        byte = f.read(1)[0]
        result |= (byte & 0x7f) << shift
        shift += 7
        # The continuation flag lives in the top bit of every byte.
        more = bool(byte & 0x80)
    return result


def is_dex(f):
    """Check the DEX magic at the current position of *f* and return the version.

    Reads 8 bytes in total: a 4-byte magic (the bytes ``dex`` followed by a
    newline) and a 4-byte version field whose first three bytes are returned
    as text. The magic is echoed to stdout before it is checked.

    Args:
        f: Binary file-like object positioned at the start of the header.

    Returns:
        The three-character version string, e.g. ``"035"``.

    Raises:
        ValueError: If the magic does not match, or the version field is
            truncated or not ASCII.
    """
    magic = f.read(4)
    # Compare raw bytes. Decoding arbitrary binary data as UTF-8 first would
    # fail with a codec error instead of the intended message below.
    print(magic.decode("ascii", errors="replace"))
    if magic != b'dex\n':
        raise ValueError("not a valid dex file")
    version_field = f.read(4)
    if len(version_field) < 4:
        # A short read means the file ends inside the magic/version area.
        raise ValueError("not a valid dex file")
    try:
        return version_field[:3].decode("ascii")
    except UnicodeDecodeError:
        raise ValueError("not a valid dex file") from None


def parse_dex_header(f):
    """Parse the DEX header from *f* and return its size/offset table.

    The magic and version are validated through ``is_dex``. The checksum,
    signature, file size, header size and endian tag are read and printed
    but not returned. The remaining seventeen little-endian uint32 fields
    are collected into a dict keyed by field name, which is printed and
    returned.

    Args:
        f: Binary file-like object positioned at the start of the header.

    Returns:
        dict mapping names such as ``'string_ids_size'`` / ``'string_ids_off'``
        to their integer values.
    """
    def read_u32():
        # One little-endian unsigned 32-bit integer from the current position.
        return struct.unpack("<I", f.read(4))[0]

    version = is_dex(f)
    print("dex version", version)

    print("check sum", hex(read_u32()))
    print("signature", f.read(20).hex())
    print("file size", read_u32())
    print("header size", read_u32())
    # An endian tag of 0x12345678 indicates little-endian.
    print("end tag", hex(read_u32()))

    field_names = (
        'link_size', 'link_off', 'map_off',
        'string_ids_size', 'string_ids_off',
        'type_ids_size', 'type_ids_off',
        'proto_ids_size', 'proto_ids_off',
        'field_ids_size', 'field_ids_off',
        'method_ids_size', 'method_ids_off',
        'class_defs_size', 'class_defs_off',
        'data_size', 'data_off',
    )
    # The fields are stored back to back, so read them in declaration order.
    table = {field: read_u32() for field in field_names}
    print(table)
    return table


def read_dex_str(f, off, size):
    """Read *size* strings referenced by the string-id table at *off*.

    Each table entry is a little-endian uint32 holding the file offset of
    the string data. The string data starts with a ULEB128 length followed
    by the NUL-terminated, MUTF-8 encoded text.

    Args:
        f: Seekable binary file-like object.
        off: File offset of the first table entry.
        size: Number of table entries to read.

    Returns:
        list of dicts with keys ``"text"`` (decoded string), ``"offset"``
        (file offset of the string data) and ``"text_size"`` (the ULEB128
        length prefix). The list is also printed.

    Raises:
        UnicodeDecodeError: If the string bytes are not valid MUTF-8.
    """
    text_info = []
    for index in range(size):
        f.seek(off + index * 4)
        # First read the offset of the string data ...
        str_offset = struct.unpack("<I", f.read(4))[0]
        # ... then move the file position there.
        f.seek(str_offset)
        # The ULEB128 prefix counts UTF-16 code units, NOT bytes, so it cannot
        # be used directly as the number of bytes to read.
        utf16_size = read_uleb128(f)
        # One code unit occupies at most 3 bytes in MUTF-8, so this read is
        # guaranteed to include the terminating NUL; cut the data there.
        raw = f.read(utf16_size * 3 + 1).split(b"\x00", 1)[0]
        # MUTF-8 differs from UTF-8 in two ways: U+0000 is stored as C0 80 and
        # supplementary characters are stored as two 3-byte surrogates. Undo
        # the first by substitution and the second by a UTF-16 round trip,
        # which joins surrogate pairs into single code points.
        string_data = (
            raw.replace(b"\xc0\x80", b"\x00")
            .decode("utf-8", "surrogatepass")
            .encode("utf-16-le", "surrogatepass")
            .decode("utf-16-le", "surrogatepass")
        )
        text_info.append({
            "text": string_data,
            "offset": str_offset,
            "text_size": utf16_size
        })
    print(text_info)
    return text_info


def parse_dex_types_ids(f, offset, size):
    """Read the type-id table: *size* little-endian uint32 values at *offset*.

    Args:
        f: Seekable binary file-like object.
        offset: File offset of the first entry.
        size: Number of entries.

    Returns:
        list of *size* integers in table order. The list is also printed.

    Raises:
        struct.error: If the file ends before *size* entries were read.
    """
    # The previous version nested two identical loops and therefore returned
    # size * size values; the entries are contiguous, so one bulk read is
    # enough.
    f.seek(offset)
    type_ids = list(struct.unpack("<%dI" % size, f.read(4 * size)))
    print(type_ids)
    return type_ids


def parse_proto_id(f, offset, size):
    """Read the proto-id table: *size* 12-byte entries starting at *offset*.

    Every entry consists of three little-endian uint32 values stored under
    the keys ``'idx'``, ``'type_idx'`` and ``'params_off'``. When
    ``params_off`` is non-zero it points to a list made of a uint32 count
    followed by that many uint16 values; a non-empty list is stored under
    ``'params'``.

    Args:
        f: Seekable binary file-like object.
        offset: File offset of the first entry.
        size: Number of entries.

    Returns:
        list of dicts, one per entry, in table order. The list is also
        printed. (Earlier versions only printed it and returned ``None``.)

    Raises:
        struct.error: If the file ends in the middle of an entry or list.
    """
    proto_ids = []
    for index in range(size):
        f.seek(offset + index * 12)
        idx, type_idx, params_off = struct.unpack("<3I", f.read(12))
        idx_item = {'idx': idx, 'type_idx': type_idx, 'params_off': params_off}
        if params_off != 0:
            f.seek(params_off)
            params_size = struct.unpack("<I", f.read(4))[0]
            # The 'params' key is only present when there is at least one
            # parameter, matching the previous output format.
            if params_size:
                idx_item['params'] = list(
                    struct.unpack("<%dH" % params_size, f.read(2 * params_size))
                )
        proto_ids.append(idx_item)
    print(proto_ids)
    return proto_ids


def parse_dex_field_ids(f, offset, size):
    """Read the field-id table: *size* 8-byte entries starting at *offset*.

    Each entry is a uint16 class index, a uint16 type index and a uint32
    name index, all little-endian.

    Args:
        f: Seekable binary file-like object.
        offset: File offset of the first entry.
        size: Number of entries.

    Returns:
        list of dicts with keys ``'class_idx'``, ``'type_indx'`` and
        ``'name_indx'``, in table order. The list is also printed.
        (Earlier versions only printed it and returned ``None``.)

    Raises:
        struct.error: If the file ends in the middle of an entry.
    """
    field_indxs = []
    for index in range(size):
        # Seek by entry index. The previous code used `size` here, which
        # re-read one position past the table on every iteration.
        f.seek(offset + 8 * index)
        # The name index is 32 bits wide; reading it as 16 bits dropped the
        # upper half.
        class_idx, type_indx, name_indx = struct.unpack("<HHI", f.read(8))
        field_indxs.append({
            'class_idx': class_idx,
            "type_indx": type_indx,
            "name_indx": name_indx
        })
    print(field_indxs)
    return field_indxs


def parse_method_idx(f, offset, size):
    """Read the method-id table: *size* 8-byte entries starting at *offset*.

    Each entry is a uint16 class index, a uint16 proto index and a uint32
    name index, all little-endian.

    Args:
        f: Seekable binary file-like object.
        offset: File offset of the first entry.
        size: Number of entries.

    Returns:
        list of dicts with keys ``"class_idx"``, ``"proto_idx"`` and
        ``"name_idx"``, in table order. The list is also printed.
        (Earlier versions only printed it and returned ``None``.)

    Raises:
        struct.error: If the file ends in the middle of an entry.
    """
    method_idxs = []
    for index in range(size):
        f.seek(offset + 8 * index)
        class_idx, proto_idx, name_idx = struct.unpack("<HHI", f.read(8))
        method_idxs.append({
            "class_idx": class_idx,
            "proto_idx": proto_idx,
            "name_idx": name_idx
        })
    print(method_idxs)
    # Return the table like the string/type parsers do, so callers can use it.
    return method_idxs


def read_annotations(f, off):
    """Placeholder for annotation parsing.

    Moves the file position of *f* to *off* and returns an empty dict;
    no annotation data is decoded yet.
    """
    # TODO: decode the annotation data found at `off`.
    f.seek(off)
    annotations = dict()
    return annotations


def read_class_data(f, off):
    """Parse the class data stored at file offset *off*.

    The data starts with four ULEB128 counts (static fields, instance
    fields, direct methods, virtual methods) followed by that many encoded
    fields and methods, in the same order. For every method with a non-zero
    ``code_off`` the code is parsed through ``read_code`` and stored under
    ``'code'``; the file position is restored afterwards so the sequential
    read can continue.

    Args:
        f: Seekable binary file-like object.
        off: File offset of the class data.

    Returns:
        dict holding the four counts and the four lists
        (``'static_fields'``, ``'instance_fields'``, ``'direct_methods'``,
        ``'virtual_methods'``).
    """
    def read_fields(count):
        # Each encoded field is two ULEB128 values: index delta, access flags.
        fields = []
        for _ in range(count):
            idx_diff = read_uleb128(f)
            flags = read_uleb128(f)
            # NOTE: the key spelling 'field_ifx_diff' is kept as-is on purpose;
            # it is part of the returned data.
            fields.append({'field_ifx_diff': idx_diff, 'access_flags': flags})
        return fields

    def read_methods(count):
        # Each encoded method is three ULEB128 values: index delta, access
        # flags and the offset of its code.
        methods = []
        for _ in range(count):
            method = {}
            method['method_idx_diff'] = read_uleb128(f)
            method['access_flags'] = read_uleb128(f)
            method['code_off'] = read_uleb128(f)
            resume_at = f.tell()
            if method['code_off'] != 0:
                method['code'] = read_code(f, method['code_off'])
                # read_code moves the file position; go back to the method list.
                f.seek(resume_at)
            methods.append(method)
        return methods

    f.seek(off)
    static_fields_size = read_uleb128(f)
    instance_fields_size = read_uleb128(f)
    direct_methods_size = read_uleb128(f)
    virtual_methods_size = read_uleb128(f)

    static_fields = read_fields(static_fields_size)
    instance_fields = read_fields(instance_fields_size)
    direct_methods = read_methods(direct_methods_size)
    virtual_methods = read_methods(virtual_methods_size)

    result = {}
    result['static_fields_size'] = static_fields_size
    result['instance_fields_size'] = instance_fields_size
    result['direct_methods_size'] = direct_methods_size
    result['virtual_methods_size'] = virtual_methods_size
    result['static_fields'] = static_fields
    result['instance_fields'] = instance_fields
    result['direct_methods'] = direct_methods
    result['virtual_methods'] = virtual_methods
    return result


def read_static_values(f, off):
    """Placeholder for static-value parsing.

    Moves the file position of *f* to *off* and returns an empty dict;
    no static values are decoded yet.
    """
    # TODO: decode the static values found at `off`.
    f.seek(off)
    static_values = dict()
    return static_values


def read_code(f, off):
    """Parse the code item stored at file offset *off*.

    The fixed part is four little-endian uint16 values (registers, ins,
    outs, tries) followed by two uint32 values (debug-info offset and
    instruction count); it is followed by ``insns_size`` uint16 code units.

    When ``debug_info_off`` is non-zero, the start of the debug info is read
    as well: two ULEB128 values (``line_start``, ``parameters_size``) and up
    to three raw bytes stored under ``'opcode'``. An offset of zero means
    there is no debug info, and ``'debug_info'`` stays an empty dict.

    Args:
        f: Seekable binary file-like object.
        off: File offset of the code item.

    Returns:
        dict with the header fields, ``'insns'`` (list of ints) and
        ``'debug_info'`` (dict, possibly empty).

    Raises:
        struct.error: If the file ends inside the header or instructions.
    """
    f.seek(off)
    (registers_size, ins_size, outs_size, tries_size,
     debug_info_off, insns_size) = struct.unpack("<4H2I", f.read(16))
    insns = list(struct.unpack("<%dH" % insns_size, f.read(2 * insns_size)))
    item = {
        'registers_size': registers_size,
        'ins_size': ins_size,
        'outs_size': outs_size,
        'tries_size': tries_size,
        'debug_info_off': debug_info_off,
        'debug_info': dict(),
        'insns_size': insns_size,
        'insns': insns
    }
    # Offset 0 means "no debug info". Seeking there unconditionally would
    # decode the start of the file (the DEX magic) as debug data.
    if debug_info_off != 0:
        f.seek(debug_info_off)
        item['debug_info'] = {
            'line_start': read_uleb128(f),
            'parameters_size': read_uleb128(f),
            # NOTE(review): per the Dalvik format, `parameters_size` name
            # indices precede the debug opcodes and are not skipped here, so
            # these bytes are opcodes only when parameters_size is 0. Confirm
            # before relying on them.
            'opcode': list(f.read(3)),
        }
    return item


def parse_class_def(f):
    """Parse one class definition from the current position of *f*.

    Reads eight consecutive little-endian uint32 fields. For each of
    ``annotations_off``, ``class_data_off`` and ``static_values_off`` that
    is non-zero, the referenced data is parsed by the matching reader and
    stored under ``'annotations'``, ``'class_data'`` or ``'static_values'``.

    Args:
        f: Seekable binary file-like object positioned at the entry.

    Returns:
        dict with the eight header fields plus any parsed sub-structures.
    """
    field_names = (
        'class_idx',
        'access_flags',
        'superclass_idx',
        'interfaces_off',
        'source_file_idx',
        'annotations_off',
        'class_data_off',
        'static_values_off',
    )
    # The fields are stored back to back; read them in declaration order.
    class_def = {name: struct.unpack('<I', f.read(4))[0] for name in field_names}

    # An offset of zero means the optional section is absent.
    annotations_off = class_def['annotations_off']
    if annotations_off != 0:
        class_def['annotations'] = read_annotations(f, annotations_off)

    class_data_off = class_def['class_data_off']
    if class_data_off != 0:
        class_def['class_data'] = read_class_data(f, class_data_off)

    static_values_off = class_def['static_values_off']
    if static_values_off != 0:
        class_def['static_values'] = read_static_values(f, static_values_off)

    return class_def


def parse_map(f, offset):
    """Parse the map list stored at file offset *offset*.

    The list starts with a little-endian uint32 entry count, followed
    directly by that many 12-byte entries: uint16 type, uint16 unused,
    uint32 size and uint32 offset.

    Args:
        f: Seekable binary file-like object.
        offset: File offset of the map list (its count field).

    Returns:
        list of dicts with keys ``'type'``, ``'unused'``, ``'size'`` and
        ``'offset'``, in file order. The count and the list are also
        printed. (Earlier versions only printed and returned ``None``.)

    Raises:
        struct.error: If the file ends inside the count or an entry.
    """
    f.seek(offset)
    size = struct.unpack("<I", f.read(4))[0]
    print(size)
    map_data = []
    for _ in range(size):
        # Entries follow the count back to back, so keep reading
        # sequentially. The previous code re-seeked relative to a global
        # `map_offset` and ignored the 4-byte count, reading every entry
        # 4 bytes too early.
        item_type, unused, item_size, item_offset = struct.unpack("<HHII", f.read(12))
        map_data.append({
            'type': item_type,
            'unused': unused,
            'size': item_size,
            'offset': item_offset,
        })
    print(map_data)
    return map_data


# Script entry point: parse ./classes.dex section by section. Every parser
# prints what it decoded.
if __name__ == '__main__':
    with open("./classes.dex", 'rb') as f:
        parse_dex_headers_map = parse_dex_header(f)
        # The string table must be sized with string_ids_size; the previous
        # code used type_ids_size here and read the wrong number of strings.
        text_off, text_size = parse_dex_headers_map['string_ids_off'], parse_dex_headers_map['string_ids_size']
        read_dex_str(f, text_off, text_size)
        type_ids_off, type_ids_size = parse_dex_headers_map['type_ids_off'], parse_dex_headers_map['type_ids_size']
        parse_dex_types_ids(f, type_ids_off, type_ids_size)
        proto_ids_off, proto_ids_size = parse_dex_headers_map['proto_ids_off'], parse_dex_headers_map['proto_ids_size']
        parse_proto_id(f, proto_ids_off, proto_ids_size)
        field_ids_off, field_ids_size = parse_dex_headers_map['field_ids_off'], parse_dex_headers_map['field_ids_size']
        parse_dex_field_ids(f, field_ids_off, field_ids_size)
        method_ids_off, method_ids_size = parse_dex_headers_map['method_ids_off'], parse_dex_headers_map[
            'method_ids_size']
        parse_method_idx(f, method_ids_off, method_ids_size)
        class_defs_off, class_defs_size = parse_dex_headers_map['class_defs_off'], parse_dex_headers_map[
            'class_defs_size']
        # parse_class_def reads from the current file position and moves it
        # while following sub-structures, so seek to each entry explicitly.
        # An entry is eight uint32 fields, i.e. 32 bytes.
        for class_index in range(class_defs_size):
            f.seek(class_defs_off + 32 * class_index)
            print(parse_class_def(f))
        map_offset = parse_dex_headers_map['map_off']
        parse_map(f, map_offset)


おすすめ

転載: blog.csdn.net/esabeny/article/details/113106783