Build a data similarity system

1. Construction based on shared code (high difficulty)

  After investigation, there is no open-source tool that builds shared-code associations directly. For example, BinDiff can compute a similarity score between two samples and visualize their code differences, and its interface can run such comparisons on samples in batches, but the association is still only ever between two samples at a time; it cannot be generalized across many samples.

  Shared code definition: fully disassemble the sample (filtering out system API assembly code), with the aim of keeping only the code compiled into the malware itself (nothing belonging to the system or third parties), then cut it into fragments algorithmically (similar to MinHash shingling).
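  A minimal sketch of the fragment-cutting step (assuming the capstone disassembler package; extract_fragments is a hypothetical helper, and a real pipeline would resolve the import table to filter system API calls instead of simply dropping every call instruction):

# Sketch: disassemble raw code bytes and cut the mnemonic stream into
# overlapping n-grams ("fragments") that can later be fed to MinHash.
import capstone

def extract_fragments(code_bytes, base_address, n=4):
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    mnemonics = []
    for insn in md.disasm(code_bytes, base_address):
        if insn.mnemonic == 'call':   # crude stand-in for system-API filtering
            continue
        mnemonics.append(insn.mnemonic)
    # overlapping n-grams play the role of the "cut" shared-code fragments
    return set(' '.join(mnemonics[i:i+n]) for i in range(len(mnemonics) - n + 1))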

Value:

  Usage 1: Extract the shared code of different families, generate code clusters and conclusions algorithmically, correlate commonalities across the code of different groups, and improve intelligence analysis (an approach already in use at a number of foreign intelligence and security companies).

  Usage 2: Treat the different families as the main nodes and build a shared-code similarity system (a match-and-search system) over the code fragments.

  Usage 3: The shared-code data can be provided to the algorithm team (if they need it) for data modeling and data mining.

2. Construction based on MinHash (moderate difficulty)

  First try PE-format files, such as .sys, .dll, and .exe, computing MinHash separately per file type, or simply build a MinHash index over the whole sample set.
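  A minimal, self-contained illustration of the MinHash idea (using hashlib here so it runs anywhere; the full script below uses the murmur module instead):

# Sketch: estimate Jaccard similarity between two attribute sets via MinHash.
import hashlib

NUM_HASHES = 256

def seeded_hash(value, seed):
    # seeded hash built on md5; stands in for murmur.string_hash(value, seed)
    return int(hashlib.md5(("%d:%s" % (seed, value)).encode()).hexdigest(), 16)

def minhash_signature(attributes):
    return [min(seeded_hash(a, seed) for a in attributes) for seed in range(NUM_HASHES)]

def estimate_similarity(sig_a, sig_b):
    # the fraction of matching positions approximates the Jaccard index
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return matches / float(NUM_HASHES)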

Value:

  Usage 1: Treat pairs whose similarity exceeds a 0.8 threshold as similar samples, then compare them with BinDiff in batches (or filter and compare manually by file type), analyze the shared code, and improve intelligence analysis.
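  For example, a pass like this (candidates is a hypothetical list of (path_a, path_b, similarity) tuples produced by pairwise MinHash comparison) yields the work list handed to BinDiff or an analyst:

# Sketch: keep only sample pairs that clear the 0.8 similarity threshold.
SIMILARITY_THRESHOLD = 0.8

def filter_candidates(candidates):
    hits = [(a, b, s) for (a, b, s) in candidates if s >= SIMILARITY_THRESHOLD]
    hits.sort(key=lambda entry: entry[2], reverse=True)   # most similar first
    return hits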
Example output for Hellsing versus APT15 (fewer than 100 samples each):

  • Generate the database from the Hellsing sample set:
    (screenshot)

  • Search the APT15 sample set to compute similarity scores; output like the following means there was no hit:
    (screenshot)

  • Using APT1 as an example to generate the database, a hit looks like this:
    (screenshot)
#!/usr/bin/python

import argparse
import os
import murmur
import shelve
import sys
from numpy import *
from listing_5_1 import *  # provides the getstrings() and pecheck() helpers used below

NUM_MINHASHES = 256
NUM_SKETCHES = 8

def wipe_database():
    dbpath = "/".join(__file__.split('/')[:-1] + ['samples.db'])
    os.system("rm -f {0}".format(dbpath))

def get_database():
    dbpath = "/".join(__file__.split('/')[:-1] + ['samples.db'])
    return shelve.open(dbpath,protocol=2,writeback=True)

def minhash(attributes):
    minhashes = []
    sketches = []
    # NUM_MINHASHES seeded hashes: keep the minimum hash per seed, so the
    # chance two samples agree at a position equals their Jaccard similarity
    for i in range(NUM_MINHASHES):
        minhashes.append(
            min([murmur.string_hash(str(attribute),i) for attribute in attributes])
        )
    # bundle the minhashes into sketches used as database keys for fast lookup
    for i in xrange(0,NUM_MINHASHES,NUM_SKETCHES):
        sketch = murmur.string_hash(str(minhashes[i:i+NUM_SKETCHES]))
        sketches.append(sketch)
    return array(minhashes),sketches

def store_sampledirectories(DirectPath):
    # despite its name, this walks a directory of new samples and runs a
    # similarity search on each one against the existing database
    for root, dirs, paths in os.walk(DirectPath):
        for path in paths:
            full_path = os.path.join(root,path)
            search_sample(full_path)

def store_sample(path):
    db = get_database()
    attributes = getstrings(path)
    minhashes,sketches = minhash(attributes)

    # index the sample under every one of its sketches so that any single
    # matching sketch is enough to surface it as a search candidate
    for sketch in sketches:
        sketch = str(sketch)
        if not sketch in db:
            db[sketch] = set([path])
        else:
            obj = db[sketch]
            obj.add(path)
            db[sketch] = obj
    db[path] = {'minhashes':minhashes,'comments':[]}
    db.sync()

    print "Extracted {0} attributes from {1} ...".format(len(attributes),path)

def comment_sample(path):
    db = get_database()
    comment = raw_input("Enter your comment:")
    if not path in db:
        store_sample(path)
    comments = db[path]['comments']
    comments.append(comment)
    db[path]['comments'] = comments
    db.sync()
    print "Stored comment:",comment

def search_sample(path):
    db = get_database()
    attributes = getstrings(path)
    minhashes,sketches = minhash(attributes)
    neighbors = []

    for sketch in sketches:
        sketch = str(sketch)

        if not sketch in db:
            continue

        for neighbor_path in db[sketch]:
            neighbor_minhashes = db[neighbor_path]['minhashes']
            similarity = (neighbor_minhashes == minhashes).sum() / float(NUM_MINHASHES)
            neighbors.append((neighbor_path,similarity))

    neighbors = list(set(neighbors))
    neighbors.sort(key=lambda entry:entry[1],reverse=True)
    print ""
    print "Sample name".ljust(64),"Shared code estimate"
    for neighbor, similarity in neighbors:
        short_neighbor = neighbor.split("/")[-1]
        comments = db[neighbor]['comments']
        print str("[*] "+short_neighbor).ljust(64),similarity
        for comment in comments:
            print "\t[comment]",comment

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="""
Simple code-sharing search system which allows you to build up a database of malware samples (indexed by file paths) and
then search for similar samples given some new sample
"""
    )

    parser.add_argument(
        "-l","--load",dest="load",default=None,
        help="Path to directory containing malware, or individual malware file, to store in database"
    )

    parser.add_argument(
        "-s","--search",dest="search",default=None,
        help="Individual malware file to perform similarity search on"
    )

    parser.add_argument(
        "-c","--comment",dest="comment",default=None,
        help="Comment on a malware sample path"
    )

    parser.add_argument(
        "-w","--wipe",action="store_true",default=False,
        help="Wipe sample database"
    )

    parser.add_argument(
        "-sd","--searchdir",dest="searchdir",default=None,
        help="Directory of samples from another APT group to search against the database, reporting any hits"
    )

    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
    if args.load:
        malware_paths = [] # where we'll store the malware file paths
        malware_attributes = dict() # where we'll store the malware strings

        for root, dirs, paths in os.walk(args.load):
            # walk the target directory tree and store all of the file paths
            for path in paths:
                full_path = os.path.join(root,path)
                malware_paths.append(full_path)

        # filter out any paths that aren't PE files
        malware_paths = filter(pecheck, malware_paths)

        # get and store the strings for all of the malware PE files
        for path in malware_paths:
            store_sample(path)

    if args.search:
        search_sample(args.search)

    if args.searchdir:
        store_sampledirectories(args.searchdir)

    if args.comment:
        comment_sample(args.comment)

    if args.wipe:
        wipe_database()
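  To reproduce the Hellsing/APT15 run shown above (assuming the script is saved as search.py, a hypothetical name): build the database with python search.py -l <Hellsing sample directory>, then run python search.py -sd <APT15 sample directory> to search every sample in that directory against it. -s searches a single file, -c attaches an analyst comment to a stored sample, and -w wipes the database.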

3. Construction based on other "feature bags" (low difficulty; not used)

Problems (for PE/ELF):

Problem 1: Both PE and ELF samples can be packed.

Solution: compression packers can be handled by calling the interfaces of generic third-party unpacking tools; samples with IAT encryption, heavy obfuscation, or strong packers are simply discarded (they take no part in the MinHash computation or shared-code extraction).
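A rough pre-filter along these lines (assuming the pefile package; the 7.0 entropy cutoff is a common heuristic rather than a hard rule) can flag likely-packed PE samples for unpacking or discarding:

# Sketch: flag PE files with suspiciously high-entropy sections, a common
# (imperfect) indicator of packed or encrypted content.
import pefile

ENTROPY_CUTOFF = 7.0

def looks_packed(path):
    pe = pefile.PE(path, fast_load=True)
    return any(section.get_entropy() > ENTROPY_CUTOFF for section in pe.sections)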

Problem 2: The MinHash algorithm itself is mature and relatively reliable, but shared-code extraction still has obstacles to overcome.

Solution: implement it ourselves, first trying mature Python libraries for the algorithm, then improving it continuously through optimization.
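One mature library worth trying (an assumption about tooling, not something the script above uses) is datasketch, whose MinHash class replaces the hand-rolled signature code:

# Sketch: datasketch's MinHash as a drop-in for the hand-rolled version.
from datasketch import MinHash

def signature(attributes, num_perm=256):
    m = MinHash(num_perm=num_perm)
    for attribute in attributes:
        m.update(attribute.encode('utf8'))
    return m

sig_a = signature(["kernel32.dll", "CreateFileA", "http://example.test"])
sig_b = signature(["kernel32.dll", "CreateFileA", "WriteFile"])
print(sig_a.jaccard(sig_b))   # estimated Jaccard similarity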

Origin: blog.51cto.com/13352079/2547061