Skip to content

Commit

Permalink
fix error in duplicate finder
Browse files Browse the repository at this point in the history
  • Loading branch information
Sg4Dylan committed May 12, 2020
1 parent 98743e3 commit a17ed96
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 15 deletions.
13 changes: 6 additions & 7 deletions gui/simple.ui
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
<rect>
<x>0</x>
<y>0</y>
<width>868</width>
<height>630</height>
<width>870</width>
<height>667</height>
</rect>
</property>
<property name="windowTitle">
Expand Down Expand Up @@ -120,7 +120,7 @@
<number>1000</number>
</property>
<property name="value">
<number>15</number>
<number>17</number>
</property>
</widget>
</item>
Expand Down Expand Up @@ -199,8 +199,7 @@
</font>
</property>
<property name="text">
<string>请注意:
依据不同状况查重过程可能需要耗费数秒至数分钟, 请耐心等待</string>
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; font-size:11pt; color:#ff0000;&quot;&gt;注意:&lt;/span&gt;&lt;span style=&quot; font-size:11pt; color:#000000;&quot;&gt;依据不同状况查重过程可能需要耗费数秒至数分钟,请耐心等待&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
</widget>
</item>
Expand Down Expand Up @@ -243,7 +242,7 @@
<double>0.100000000000000</double>
</property>
<property name="value">
<double>95.000000000000000</double>
<double>98.500000000000000</double>
</property>
</widget>
</item>
Expand Down Expand Up @@ -362,7 +361,7 @@
</font>
</property>
<property name="text">
<string>注意:初次使用时,请添加至少一个索引图库路径,并更新索引记录</string>
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; font-size:12pt; color:#ff0000;&quot;&gt;注意:&lt;/span&gt;&lt;span style=&quot; font-size:12pt;&quot;&gt;初次使用时,请添加至少一个索引图库路径,并更新索引记录&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="wordWrap">
<bool>true</bool>
Expand Down
23 changes: 15 additions & 8 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,25 @@ def checkout(image_path, exists_index, match_n=5):
return [(sim[i], exists_index[ids[i]]) for i in range(len(ids))]


def get_duplicate(exists_index, threshold, match_n=5):
    """Yield pairs of indexed entries whose similarity reaches ``threshold``.

    For every position ``idx`` in ``exists_index`` the stored feature vector
    is fetched from ``ir_engine.hnsw_index`` and matched against the index
    with ``ir_engine.match``.  The neighbour window starts at ``match_n`` and
    is enlarged while the last returned neighbour still reaches the
    threshold, so that matches beyond the initial window are not cut off.

    Args:
        exists_index: Sequence of indexed entries; ids returned by
            ``ir_engine.match`` are used as positions into it.
        threshold: Minimum similarity for a pair to be reported.
        match_n: Initial number of neighbours requested per entry.

    Yields:
        ``(exists_index[idx], exists_index[other], similarity)`` tuples.

    NOTE(review): assumes ``ir_engine.match`` returns ``(sim, ids)`` ordered
    from most to least similar, since the last element is used as the
    weakest match -- confirm against the engine implementation.
    """
    matched = set()
    total = len(exists_index)
    for idx in tqdm(range(total), ascii=True):
        try:
            fv = ir_engine.hnsw_index.get_items([idx])[0]
        except RuntimeError:
            # The vector for this id could not be fetched; skip the entry.
            continue

        # Never ask for more neighbours than there are entries.
        k = max(1, min(match_n, total))
        sim, ids = ir_engine.match(fv, k)
        # Widen the window while even the weakest returned neighbour still
        # qualifies.  Stop once the window covers every entry or the engine
        # returns fewer results than requested; otherwise the loop could
        # never terminate when everything is above the threshold.
        while len(sim) >= k and sim[-1] >= threshold and k < total:
            k = min(total, max(k + 1, round(k * 1.5)))
            sim, ids = ir_engine.match(fv, k)

        for score, other in zip(sim, ids):
            if other == idx:
                # The query vector always matches itself.
                continue
            if score < threshold:
                continue
            if other in matched:
                # This pair was already reported when `other` was the source.
                continue
            matched.add(idx)
            yield (exists_index[idx], exists_index[other], score)

0 comments on commit a17ed96

Please sign in to comment.