-- Create the source table (no primary key or unique constraint is declared).
create table t_source (
    item_id       int,
    created_time  datetime,
    modified_time datetime,
    item_name     varchar(20),
    other         varchar(20)
);

-- Create the target table with the same structure as the source table.
create table t_target like t_source;
-- Generate 1,000,000 test rows in t_source, of which 500,000 repeat the
-- (created_time, item_name) pair of another row.
--
-- sp_generate_data takes no parameters and works in two phases:
--   1. Insert 500,000 rows one at a time, with item_id = 1..500000,
--      created_time = modified_time = '2017-01-01' + item_id seconds,
--      and item_name = 'a' followed by item_id.
--   2. Copy those rows back into t_source with item_id shifted by 500,000 and
--      modified_time shifted by 500,000 seconds, keeping created_time and
--      item_name unchanged so that each copy duplicates an existing pair.
--
-- Note: the procedure uses session user variables (@i, @created_time,
-- @modified_time, @item_name, @last_insert_id), so their values are
-- overwritten in the calling session.
delimiter //
create procedure sp_generate_data()
begin
    -- Phase 1: 500,000 rows with distinct (created_time, item_name).
    set @i := 1;
    while @i <= 500000 do
        set @created_time := date_add('2017-01-01', interval @i second);
        set @modified_time := @created_time;
        set @item_name := concat('a', @i);
        insert into t_source (item_id, created_time, modified_time, item_name, other)
        values (@i, @created_time, @modified_time, @item_name, 'OTHER');
        set @i := @i + 1;
    end while;
    commit;

    -- Phase 2: duplicate every row, changing only item_id and modified_time.
    set @last_insert_id := 500000;
    insert into t_source (item_id, created_time, modified_time, item_name, other)
    select item_id + @last_insert_id,
           created_time,
           date_add(modified_time, interval @last_insert_id second),
           item_name,
           'OTHER'
    from t_source;
    commit;
end
//
delimiter ;
-- Populate t_source with the generated test data.
call sp_generate_data();

-- The source table has no primary key or unique constraint, so it may contain
-- two completely identical rows; insert one such row to simulate that case.
insert into t_source (item_id, created_time, modified_time, item_name, other)
select item_id, created_time, modified_time, item_name, other
from t_source
where item_id = 1;
-- The source table now contains 1,000,001 rows; after de-duplication the target table should contain 500,000 rows.
mysql> select count(*),count(distinct created_time,item_name) from t_source;
+----------+----------------------------------------+
| count(*) | count(distinct created_time,item_name) |
+----------+----------------------------------------+
| 1000001 | 500000 |
+----------+----------------------------------------+
1 row in set (1.92 sec)
---------------------