sql server - How can I improve this query for deleting duplicates? -

- September 15, 2011

i working on document management system. documents imported system. due error, of them imported twice. need delete duplicates. have document id previous system, can't delete documents associated multiple accounts , supposed in there twice, have check against well. associated values in different tables. have created following script come doc id's delete incredibly slow (it has been running 4 days on table less 2 million records).

declare @docidtodelete int declare @docid int declare @sourcedocid varchar(12) declare @taxid decimal(9,0) declare @account bigint   select @docid = min(d.docid) docs d inner join contents c on d.docid = c.docid , c.folid=1 while @docid not null       begin             --get source document id document             select @sourcedocid = val vtab0031 idxid=31 , docid=@docid              -- see if there document same source document id             select @docidtodelete = isnull(max(v.docid),0) vtab0031 v             inner join contents c on v.docid = c.docid , c.folid=1             idxid=31 , val = @sourcedocid              if @docid<@docidtodelete -- have possible duplicate lets check , see if matches on account                   begin                         select @account = val vtab0002 idxid=2 , docid=@docid                         select @docidtodelete = isnull(max(v.docid),0) vtab0002 v                               idxid=2 , val = @account , v.docid=@docidtodelete                         if @docid<@docidtodelete -- still have possible duplicate lets check , see if matches on taxid                         begin                               select @taxid = val vtab0006 idxid=6 , docid=@docid                               select @docidtodelete = isnull(max(v.docid),0) vtab0006 v                                     idxid=6 , val = @taxid , v.docid = @docidtodelete                               if @docid<@docidtodelete -- still have match delete                                                                 begin                                     insert deletedduplicates values(@docidtodelete ,@docid)                                                                 end                         end                   end             select @docid = min(d.docid) docs d                   inner join contents c on d.docid = c.docid , c.folid=1                   d.docid > @docid       end

it's better use set operations rather procedural operations when working rdbms.

try instead:

select  docidtodelete,         docidtokeep    deletedduplicates (     select  max(docid) docidtodelete,             min(docid) docidtokeep,             sourcedocid,             account,             taxid,             count(*) numbermatches         (         select  d.docid docid,                 s.val sourcedocid,                 a.val account,                 t.val taxid            docs d                 inner join contents c on c.docid = d.docid                 inner join vtab0031 s on s.docid = d.docid                 inner join vtab0002 on a.docid = d.docid                 inner join vtab0006 t on t.docid = d.docid           c.folid = 1                 , s.idxid = 31                 , a.idxid = 2                 , t.idxid = 6     ) summary     group    sourcedocid,                 account,                 taxid     having  numbermatches > 1 ) duplicates

update

i made new query should duplicate records. , should run more effeciently well, using indexes.

create table uniquedocuments (     docid int not null,     sourcedocid varchar(12) not null,     account bigint not null,     taxid decimal(9,0) not null     primary key clustered (sourcedocid, account, taxid) ) go  insert uniquedocuments (docid, sourcedocid, account, taxid) select  min(d.docid) docid,         s.val sourcedocid,         a.val account,         t.val taxid    docs d         inner join contents c on c.docid = d.docid         inner join vtab0031 s on s.docid = d.docid         inner join vtab0002 on a.docid = d.docid         inner join vtab0006 t on t.docid = d.docid   c.folid = 1         , s.idxid = 31         , a.idxid = 2         , t.idxid = 6 group s.val,         a.val,         t.val  insert deleteddocuments (docidtodelete, docidtokeep) select  d.docid docidtodelete,         ud.docid docidtokeep    docs d         inner join contents c on c.docid = d.docid         inner join vtab0031 s on s.docid = d.docid         inner join vtab0002 on a.docid = d.docid         inner join vtab0006 t on t.docid = d.docid         inner join uniquedocuments ud on ud.sourcedocid = s.val                                          , ud.account = a.val                                          , ud.taxid = t.val   c.folid = 1         , s.idxid = 31         , a.idxid = 2         , t.idxid = 6         , d.docid <> ud.docid

Search This Blog

Score

sql server - How can I improve this query for deleting duplicates? -

Comments

Post a Comment

Popular posts from this blog

how to build hyperlink for query string in php -

php - What is the difference between $_SERVER['PATH_INFO'] and $_SERVER['ORIG_PATH_INFO']? -

queue - mq_receive: message too long -