sql server - How can I improve this query for deleting duplicates? -
i working on document management system. documents imported system. due error, of them imported twice. need delete duplicates. have document id previous system, can't delete documents associated multiple accounts , supposed in there twice, have check against well. associated values in different tables. have created following script come doc id's delete incredibly slow (it has been running 4 days on table less 2 million records).
declare @docidtodelete int declare @docid int declare @sourcedocid varchar(12) declare @taxid decimal(9,0) declare @account bigint select @docid = min(d.docid) docs d inner join contents c on d.docid = c.docid , c.folid=1 while @docid not null begin --get source document id document select @sourcedocid = val vtab0031 idxid=31 , docid=@docid -- see if there document same source document id select @docidtodelete = isnull(max(v.docid),0) vtab0031 v inner join contents c on v.docid = c.docid , c.folid=1 idxid=31 , val = @sourcedocid if @docid<@docidtodelete -- have possible duplicate lets check , see if matches on account begin select @account = val vtab0002 idxid=2 , docid=@docid select @docidtodelete = isnull(max(v.docid),0) vtab0002 v idxid=2 , val = @account , v.docid=@docidtodelete if @docid<@docidtodelete -- still have possible duplicate lets check , see if matches on taxid begin select @taxid = val vtab0006 idxid=6 , docid=@docid select @docidtodelete = isnull(max(v.docid),0) vtab0006 v idxid=6 , val = @taxid , v.docid = @docidtodelete if @docid<@docidtodelete -- still have match delete begin insert deletedduplicates values(@docidtodelete ,@docid) end end end select @docid = min(d.docid) docs d inner join contents c on d.docid = c.docid , c.folid=1 d.docid > @docid end
it's better use set operations rather procedural operations when working rdbms.
try instead:
select docidtodelete, docidtokeep deletedduplicates ( select max(docid) docidtodelete, min(docid) docidtokeep, sourcedocid, account, taxid, count(*) numbermatches ( select d.docid docid, s.val sourcedocid, a.val account, t.val taxid docs d inner join contents c on c.docid = d.docid inner join vtab0031 s on s.docid = d.docid inner join vtab0002 on a.docid = d.docid inner join vtab0006 t on t.docid = d.docid c.folid = 1 , s.idxid = 31 , a.idxid = 2 , t.idxid = 6 ) summary group sourcedocid, account, taxid having numbermatches > 1 ) duplicates update
i made new query should duplicate records. , should run more effeciently well, using indexes.
create table uniquedocuments ( docid int not null, sourcedocid varchar(12) not null, account bigint not null, taxid decimal(9,0) not null primary key clustered (sourcedocid, account, taxid) ) go insert uniquedocuments (docid, sourcedocid, account, taxid) select min(d.docid) docid, s.val sourcedocid, a.val account, t.val taxid docs d inner join contents c on c.docid = d.docid inner join vtab0031 s on s.docid = d.docid inner join vtab0002 on a.docid = d.docid inner join vtab0006 t on t.docid = d.docid c.folid = 1 , s.idxid = 31 , a.idxid = 2 , t.idxid = 6 group s.val, a.val, t.val insert deleteddocuments (docidtodelete, docidtokeep) select d.docid docidtodelete, ud.docid docidtokeep docs d inner join contents c on c.docid = d.docid inner join vtab0031 s on s.docid = d.docid inner join vtab0002 on a.docid = d.docid inner join vtab0006 t on t.docid = d.docid inner join uniquedocuments ud on ud.sourcedocid = s.val , ud.account = a.val , ud.taxid = t.val c.folid = 1 , s.idxid = 31 , a.idxid = 2 , t.idxid = 6 , d.docid <> ud.docid
Comments
Post a Comment