Hive UDF: a unique BIGINT ID generator

1. Background

        MySQL inserts are faster when rows have a monotonically increasing BIGINT primary key, because each new row can simply be appended at the end of the data file. That is why we need a generator for such IDs.

        The server team already has an interface dedicated to producing IDs, but bulk inserts from the data platform would put far too much load on it, so they will not let us share it; we have to build our own.

 

2. Options considered

        1. Fetch a segment (range) of IDs from Redis and let the program increment within that segment by itself.

        Problem: depending on an external Redis is troublesome, and it would have to be persistent.

 

        2. Register a Python ID service (you can search online for the details of this approach).

        Problem: it needs a dedicated machine plus a server and a client, and our Python version is too old. Awkward: we do not want to upgrade the whole cluster just for this.

 

 

        3. Twitter's  snowflake algorithm

         Reference:  https://www.jianshu.com/p/54a87a7c3622

 

         Parameters: workId and datacenterId, whose combination must be unique.

  

         Problem:

          1. I cannot guarantee that both IDs are unique: several map tasks may run on the same machine at the same time, so machine-based parameters are hard to assign.

          

          Solution:

          1. In our scenario the primary key only has to be unique within a single job. Therefore I use the map task's ID as the workId; after all, every map task in the same job has a different ID. datacenterId can simply default to 0 for now.

 

3. The code:

      

/**
 * Snowflake-style generator of non-negative 64-bit IDs.
 *
 * <p>Bit layout, from most to least significant:
 * <pre>
 *   1 bit   always 0 (keeps the ID non-negative)
 *   34 bits milliseconds since {@code twepoch}, masked to the field width
 *   16 bits worker ID (the map task ID in the Hive use case)
 *   1 bit   data center ID
 *   12 bits per-millisecond sequence
 * </pre>
 *
 * <p>The timestamp field wraps roughly every 199 days (2^34 ms), so IDs are
 * only guaranteed unique among generators with distinct (worker ID, data
 * center ID) pairs whose lifetimes fall within one such window, e.g. the
 * tasks of a single job.
 *
 * <p>{@link #nextId()} is synchronized, so one instance may be shared between threads.
 */
public class MagicSnowFlake {

    // Custom epoch in milliseconds: 2017-01-01 00:00:00 at UTC+8.
    private final static long twepoch = 1483200000000L;

    // Worker (map task) ID field: 16 bits, i.e. values 0..65535.
    private final static long mapIdBits = 16L;

    private final static long ipIdMax = ~(-1L << mapIdBits);

    // Data center ID field: 1 bit, i.e. values 0..1.
    private final static long dataCenterIdBits = 1L;

    private final static long dataCenterIdMax = ~(-1L << dataCenterIdBits);

    // Sequence field: 12 bits.
    private final static long seqBits = 12L;

    // Largest sequence value: 4095 (2^12 - 1). Also used as the wrap-around mask.
    private final static long seqMax = ~(-1L << seqBits);

    // Left shifts that place each field at its position in the ID.
    private final static long dataCenterIdLeftShift = seqBits;
    private final static long mapIdLeftShift = seqBits + dataCenterIdBits;
    // The timestamp must start above ALL lower fields, so the shift is the sum of
    // their widths. Using mapIdLeftShift here instead of mapIdBits would make the
    // timestamp overlap the top bits of the worker ID and allow duplicate IDs
    // across workers.
    private final static long timeLeftShift = seqBits + dataCenterIdBits + mapIdBits;

    // Width and mask of the timestamp field; bit 63 is left as 0 so IDs stay non-negative.
    private final static long timeBits = 63L - timeLeftShift;
    private final static long timeMax = ~(-1L << timeBits);

    // Worker ID (0..ipIdMax). The name is historical; it holds the map task ID.
    private long ipId;

    // Data center ID (0..dataCenterIdMax)
    private long dataCenterId;

    // Sequence within the current millisecond (0..seqMax)
    private long seq = 0L;

    // Timestamp (ms) used for the most recently generated ID; -1 before the first call
    private long lastTime = -1L;

    /**
     * Creates a generator for the given worker and data center.
     *
     * @param ipId         worker ID, in the range 0..65535
     * @param dataCenterId data center ID, in the range 0..1
     * @throws IllegalArgumentException if either argument is outside its range
     */
    public MagicSnowFlake(long ipId, long dataCenterId) {
        // Fail with an exception rather than System.exit(0): exiting would kill the
        // hosting JVM and report success to whoever launched it.
        if (ipId < 0 || ipId > ipIdMax) {
            throw new IllegalArgumentException(
                    "ipId is not within the normal range (0~" + ipIdMax + ") " + ipId);
        }

        if (dataCenterId < 0 || dataCenterId > dataCenterIdMax) {
            throw new IllegalArgumentException(
                    "dataCenterId is not within the normal range (0~" + dataCenterIdMax + ") " + dataCenterId);
        }

        this.ipId = ipId;
        this.dataCenterId = dataCenterId;
    }

    /**
     * Generates the next ID.
     *
     * <p>Up to 4096 IDs are issued per millisecond; when the sequence is exhausted
     * the call busy-waits for the next millisecond.
     *
     * @return a non-negative ID built from the current time, worker ID, data center ID and sequence
     * @throws IllegalStateException if the system clock is behind the time of the previous call
     */
    public synchronized long nextId() {
        long nowTime = System.currentTimeMillis();

        // A clock that moved backwards could re-issue IDs that were already handed out.
        if (nowTime < lastTime) {
            throw new IllegalStateException(
                    "The current time is before the last operation time, the current time is wrong: " + nowTime);
        }

        if (nowTime == lastTime) {
            seq = (seq + 1) & seqMax;
            if (seq == 0) {
                // Sequence wrapped: this millisecond is used up, move on to the next one.
                nowTime = getNextTimeStamp();
            }
        } else {
            seq = 0L;
        }

        lastTime = nowTime;

        // Mask the elapsed time to the field width so the shift cannot spill into the sign bit.
        return (((nowTime - twepoch) & timeMax) << timeLeftShift)
                | (ipId << mapIdLeftShift)
                | (dataCenterId << dataCenterIdLeftShift)
                | seq;
    }

    /**
     * Spins until the system clock reports a millisecond later than {@code lastTime}.
     *
     * @return the first observed timestamp greater than {@code lastTime}
     */
    private long getNextTimeStamp() {
        long nowTime;
        do {
            nowTime = System.currentTimeMillis();
        } while (nowTime <= lastTime);
        return nowTime;
    }

    /** Small manual smoke test. */
    public static void main(String[] args) {
        System.out.println(Long.MAX_VALUE);
        MagicSnowFlake msf = new MagicSnowFlake(1, 1);
        msf.nextId();
        System.out.println(~ (-1L << 15));
    }
}

 

       

 

    The UDF:

   

import org.apache.hadoop.hive.ql.exec.MapredContext;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * Hive generic UDF that returns a new long ID for every row it is evaluated on.
 *
 * <p>IDs come from a {@link MagicSnowFlake} that is created lazily on the first
 * {@code evaluate} call. Its worker ID is derived from the task attempt ID read in
 * {@link #configure(MapredContext)} (task number + 1), and its data center ID is the
 * value of the UDF's first argument on that first call.
 *
 * <p>If {@code configure} is never invoked, the worker ID stays at its initial value 0.
 *
 * @author <a href="mailto:[email protected]">火锅</a>
 * @time 18/3/8
 */
@UDFType(deterministic = false, stateful = true)
public class LongIdUDF extends GenericUDF {
    private static final char SEPARATOR = '_';
    private static final String ATTEMPT = "attempt";
    // Worker ID handed to the generator; set from the task attempt ID in configure().
    private long mapTaskId = 0L;
    // Number of map tasks reported by the job configuration.
    private int increment = 0;

    // Created on the first evaluate() call, then reused for every later row.
    private MagicSnowFlake snowFlake;



    /**
     * Reads the number of map tasks and the task attempt ID from the job
     * configuration and derives this task's worker ID from them.
     *
     * @param context the MapReduce context supplying the job configuration
     * @throws IllegalArgumentException if the map task count is zero, or the
     *         "mapred.task.id" value is missing, malformed or out of range
     */
    @Override
    public void configure(MapredContext context) {
        increment = context.getJobConf().getNumMapTasks();
        if(increment == 0) {
            throw new IllegalArgumentException("mapred.map.tasks is zero");
        }

        // NOTE(review): the task number is range-checked against the number of MAP
        // tasks even when the attempt ID denotes a reduce task ("r") - confirm this
        // is intended for queries where the UDF runs on the reduce side.
        mapTaskId = getInitId(context.getJobConf().get("mapred.task.id"),increment);
        if(mapTaskId == 0L) {
            throw new IllegalArgumentException("mapred.task.id");
        }
    }

    /**
     * Checks that the data center ID argument is present and declares the return type.
     *
     * @param arguments inspectors of the UDF arguments; at least one is required
     * @return the inspector for Java {@code Long} values
     * @throws UDFArgumentException if no argument was supplied
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments)
            throws UDFArgumentException {
        // evaluate() reads arguments[0]; reject a missing argument up front instead
        // of failing with an index error on the first row.
        if(arguments == null || arguments.length < 1) {
            throw new UDFArgumentException("getLongId requires the dataCenterId as its first argument");
        }
        return PrimitiveObjectInspectorFactory.javaLongObjectInspector;
    }

    /**
     * Returns the next generated ID.
     *
     * <p>The first argument is only read on the first call, where it supplies the
     * data center ID for the generator; on later calls it is ignored.
     *
     * @param arguments the UDF arguments; {@code arguments[0]} must hold an integer value
     * @return the next ID from the generator
     * @throws HiveException if the first argument is missing, NULL or not an integer
     */
    @Override
    public Long evaluate(DeferredObject[] arguments) throws HiveException {
        if(snowFlake == null){
            int dataCenterId = parseDataCenterId(arguments);
            snowFlake = new MagicSnowFlake(getMapTaskId(),dataCenterId);
        }
        return snowFlake.nextId();
    }

    @Override
    public String getDisplayString(String[] children) {
        return "getLongId(0)";
    }

    // Extracts the data center ID from the first UDF argument, reporting bad input as HiveException.
    private static int parseDataCenterId(DeferredObject[] arguments) throws HiveException {
        if(arguments == null || arguments.length < 1) {
            throw new HiveException("getLongId requires the dataCenterId as its first argument");
        }
        Object raw = arguments[0].get();
        if(raw == null) {
            throw new HiveException("getLongId: dataCenterId argument must not be NULL");
        }
        try {
            return Integer.parseInt(raw.toString());
        } catch (NumberFormatException e) {
            throw new HiveException("getLongId: dataCenterId argument is not an integer: " + raw, e);
        }
    }


    private synchronized long getMapTaskId() {
        return mapTaskId;
    }

    /**
     * Derives the worker ID from a task attempt ID string.
     *
     * <p>Example: {@code attempt_1478926768563_0537_m_000004_0} yields 5, i.e. the
     * task number (4) plus one.
     *
     * @param taskAttemptIDstr the attempt ID: six '_'-separated parts starting with
     *        "attempt", with "m" or "r" as the fourth part and the task number as the fifth
     * @param numTasks exclusive upper bound for the task number
     * @return the task number plus one
     * @throws IllegalArgumentException if the string is null or malformed, or the
     *         task number is not below {@code numTasks}
     */
    private long getInitId (String taskAttemptIDstr,int numTasks)
            throws IllegalArgumentException {
        final String malformed =
                "TaskAttemptId string : " + taskAttemptIDstr + " is  not properly formed";
        if(taskAttemptIDstr == null) {
            throw new IllegalArgumentException(malformed);
        }

        String[] parts = taskAttemptIDstr.split(Character.toString(SEPARATOR));
        boolean wellFormed = parts.length == 6
                && ATTEMPT.equals(parts[0])
                && ("m".equals(parts[3]) || "r".equals(parts[3]));
        if(!wellFormed) {
            throw new IllegalArgumentException(malformed);
        }

        long result;
        try {
            result = Long.parseLong(parts[4]);
        } catch (NumberFormatException e) {
            // Keep the parse failure as the cause instead of discarding it.
            throw new IllegalArgumentException(malformed, e);
        }

        if(result >= numTasks) { //if taskid >= numtasks
            // Report the real reason; a generic "not properly formed" would hide it.
            throw new IllegalArgumentException("TaskAttemptId string : " + taskAttemptIDstr  + "  parse ID [" + result + "] >= numTasks[" + numTasks + "] ..");
        }
        return result + 1;
    }


    /** Small manual smoke test: prints 5 for task number 4. */
    public static void main(String[] args) {
        String s = "attempt_1478926768563_0537_m_000004_4";
        System.out.println(new LongIdUDF().getInitId(s,5));
    }

}

 

 

   summary:

          1. The code was copied and then modified by me; I have forgotten exactly where I copied it from, but it ultimately derives from Twitter's Snowflake.

          2. Tested on a table with 3 billion rows; no duplicates were found.

          3. If you have any questions, feel free to ask.

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326144324&siteId=291194637