Refactor the command to clean duplicate submissions and add a unique together constraint
rajpatel24 committed Sep 27, 2024
1 parent cdc71db commit c87f862
Showing 3 changed files with 90 additions and 74 deletions.
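The refactor below stops grouping duplicate submissions by xml_hash and groups them by uuid instead: exact duplicates (same xml_hash) are merged into a reference instance, while near-duplicates (same uuid but different xml_hash) are given a new, distinct UUID so the unique-together constraint can be applied. A minimal sketch of the uuid-grouping idea — not the committed code; the import path is taken from the model file in this commit and the helper name is ours:

from django.db.models import Count

from kobo.apps.openrosa.apps.logger.models.instance import Instance


def find_duplicate_uuids(xform_id_string=None):
    """Return the `uuid` values that appear on more than one Instance."""
    queryset = Instance.objects.all()
    if xform_id_string:
        queryset = queryset.filter(xform__id_string=xform_id_string)
    return (
        queryset.values_list('uuid', flat=True)
        .annotate(count_uuid=Count('uuid'))
        .filter(count_uuid__gt=1)
        .distinct()
    )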
@@ -38,20 +38,12 @@ def add_arguments(self, parser):
help="Specify a XForm's `id_string` to clean up only this form",
)

-        parser.add_argument(
-            '--delete-unique-uuids',
-            action='store_true',
-            default=False,
-            help='Delete duplicates with identical uuid',
-        )

def handle(self, *args, **options):
username = options['user']
xform_id_string = options['xform']
-        self._delete_unique_uuids = options['delete_unique_uuids']
self._verbosity = options['verbosity']

-        # Retrieve all instances with the same `xml_hash`.
+        # Retrieve all instances with the same `uuid`
query = Instance.objects
if xform_id_string:
query = query.filter(xform__id_string=xform_id_string)
@@ -60,49 +52,47 @@ def handle(self, *args, **options):
query = query.filter(xform__user__username=username)

query = (
-            query.values_list('xml_hash', flat=True)
-            .annotate(count_xml_hash=Count('xml_hash'))
-            .filter(count_xml_hash__gt=1)
+            query.values_list('uuid', flat=True)
+            .annotate(count_uuid=Count('uuid'))
+            .filter(count_uuid__gt=1)
.distinct()
)

-        for xml_hash in query.iterator():
-
-            duplicates_queryset = Instance.objects.filter(xml_hash=xml_hash)
-
-            instances_with_same_xml_hash = duplicates_queryset.values(
-                'id', 'uuid', 'xform_id'
+        for uuid in query.iterator():
+            # Get all instances with the same UUID
+            duplicates_queryset = Instance.objects.filter(uuid=uuid)
+
+            instances = duplicates_queryset.values(
+                'id', 'uuid', 'xml_hash', 'xform_id', 'date_created'
).order_by('xform_id', 'uuid', 'date_created')

-            duplicates_by_xform = self._get_duplicates_by_xform(
-                instances_with_same_xml_hash
+            # Separate duplicates by their xml_hash (same and different)
+            same_xml_hash_duplicates, different_xml_hash_duplicates = (
+                self._get_duplicates_by_xml_hash(instances)
)

-            for (
-                xform_id,
-                instances_with_same_xml_hash,
-            ) in duplicates_by_xform.items():
-                instance_ref = instances_with_same_xml_hash.pop(0)
-                self._clean_up(instance_ref, instances_with_same_xml_hash)
-
-    def _clean_up(self, instance_ref, duplicated_instances):
-
-        if duplicated_instances:
-
-            if self._replace_duplicates(duplicated_instances):
-                return
-
-            self._delete_duplicates(instance_ref, duplicated_instances)
+            # Handle the same xml_hash duplicates
+            if same_xml_hash_duplicates:
+                instance_ref = same_xml_hash_duplicates.pop(0)
+                self._delete_duplicates(
+                    instance_ref, same_xml_hash_duplicates
+                )
+
+            # Handle the different xml_hash duplicates (update uuid)
+            if different_xml_hash_duplicates:
+                instance_ref = different_xml_hash_duplicates.pop(0)
+                self._replace_duplicates(different_xml_hash_duplicates)
+
-    def _delete_duplicates(
-        self, instance_ref: dict, duplicated_instances: list[dict]
-    ):
+    def _delete_duplicates(self, instance_ref, duplicated_instances):
"""
Delete the duplicated instances with the same xml_hash and link their
attachments to the reference instance (instance_ref).
"""
duplicated_instance_ids = [i['id'] for i in duplicated_instances]

if self._verbosity >= 1:
self.stdout.write(
f"Deleting instance #{instance_ref['id']} duplicates…"
f"Deleting instance #{duplicated_instance_ids} duplicates…"
)

with transaction.atomic():
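The docstring above notes that the duplicates' attachments are re-linked to the reference instance before the duplicates are deleted. A hedged sketch of that step, with the models passed in as parameters because the attachment model's name and its foreign-key field are assumptions, not shown in this diff:

from django.db import transaction


def merge_exact_duplicates(instance_model, attachment_model, instance_ref_id, duplicated_instance_ids):
    """Re-attach files from the duplicates to the kept instance, then drop the duplicates."""
    with transaction.atomic():
        # Re-link attachments to the reference instance
        # (an `instance` FK on the attachment model is an assumption here).
        attachment_model.objects.filter(
            instance_id__in=duplicated_instance_ids
        ).update(instance_id=instance_ref_id)

        # Delete the now-redundant duplicate instances.
        instance_model.objects.filter(id__in=duplicated_instance_ids).delete()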
@@ -116,19 +106,18 @@ def _delete_duplicates(
)

# Update Mongo
-            main_instance = Instance.objects.get(
-                id=instance_ref['id']
-            )
+            main_instance = Instance.objects.get(id=instance_ref['id'])
main_instance.parsed_instance.save()

# Delete duplicated ParsedInstances
ParsedInstance.objects.filter(
instance_id__in=duplicated_instance_ids
).delete()

+            # Adjust counters and delete instances
instance_queryset = Instance.objects.filter(
id__in=duplicated_instance_ids
)
-            # update counters
for instance in instance_queryset.values(
'xform_id', 'date_created__date', 'xform__user_id'
):
@@ -154,42 +143,56 @@ def _delete_duplicates(
f'\tPurged instance IDs: {duplicated_instance_ids}'
)

-    def _replace_duplicates(self, duplicated_instances: list) -> bool:
-        uniq__uuids = set([i['uuid'] for i in duplicated_instances])
-
-        if len(uniq__uuids) > 1 or self._delete_unique_uuids:
-            return False
-
-        duplicates = []
-
-        for idx, duplicated_instance in enumerate(duplicated_instances):
-            try:
-                instance = Instance.objects.get(pk=duplicated_instance['id'])
-            except Instance.DoesNotExist:
-                pass
-            else:
-                if self._verbosity > 1:
-                    self.stdout.write(
-                        f'\tUpdating instance #{instance.pk} ({instance.uuid})…'
-                    )
-
-                instance.uuid = f'DUPLICATE {idx} {instance.uuid}'
-                instance.xml = set_meta(
-                    instance.xml, 'instanceID', instance.uuid
-                )
-                instance.xml_hash = instance.get_hash(instance.xml)
-                duplicates.append(instance)
-
-        if duplicates:
-            Instance.objects.bulk_update(
-                duplicates, fields=['uuid', 'xml', 'xml_hash']
-            )
-
-        return True
-
-    def _get_duplicates_by_xform(self, queryset):
-        duplicates_by_xform = defaultdict(list)
-        for record in queryset:
-            duplicates_by_xform[record['xform_id']].append(record)
-
-        return duplicates_by_xform
+    def _replace_duplicates(self, duplicated_instances):
+        """
+        Update the UUID of instances with different xml_hash values.
+        """
+        instances_to_update = []
+        for idx, duplicated_instance in enumerate(duplicated_instances):
+            try:
+                instance = Instance.objects.get(pk=duplicated_instance['id'])
+            except Instance.DoesNotExist:
+                continue
+
+            if self._verbosity > 1:
+                self.stdout.write(
+                    f'\tUpdating instance #{instance.pk} ({instance.uuid})…'
+                )
+
+            # Update the UUID and XML hash
+            instance.uuid = (f'DUPLICATE-{idx}-{instance.xform.id_string}-'
+                             f'{instance.uuid}')
+            instance.xml = set_meta(
+                instance.xml, 'instanceID', instance.uuid
+            )
+            instance.xml_hash = instance.get_hash(instance.xml)
+            instances_to_update.append(instance)
+
+            # Save the parsed instance to sync MongoDB
+            parsed_instance = instance.parsed_instance
+            parsed_instance.save()
+
+        Instance.objects.bulk_update(
+            instances_to_update, ['uuid', 'xml', 'xml_hash']
+        )
+
+    def _get_duplicates_by_xml_hash(self, instances):
+        """
+        Extract duplicates with the same xml_hash and different xml_hash
+        """
+        same_xml_hash_duplicates = []
+        different_xml_hash_duplicates = []
+
+        xml_hash_groups = defaultdict(list)
+
+        # Group instances by their xml_hash
+        for instance in instances:
+            xml_hash_groups[instance['xml_hash']].append(instance)
+
+        for xml_hash, duplicates in xml_hash_groups.items():
+            if len(duplicates) > 1:
+                same_xml_hash_duplicates.extend(duplicates)
+            else:
+                different_xml_hash_duplicates.extend(duplicates)
+
+        return same_xml_hash_duplicates, different_xml_hash_duplicates
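For reference, the cleanup can be invoked through Django's management-command machinery. A usage sketch; the command name clean_duplicated_submissions is an assumption, since the command file's path is not shown in this excerpt:

from django.core.management import call_command

# Clean duplicates for a single user and form; both filters are optional.
call_command(
    'clean_duplicated_submissions',   # assumed command name
    user='some_username',
    xform='some_xform_id_string',
    verbosity=2,
)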
@@ -2,6 +2,7 @@

from django.conf import settings
from django.db import migrations, models
+from django.db.models import UniqueConstraint


class Migration(migrations.Migration):
@@ -15,8 +16,13 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='instance',
name='root_uuid',
-            field=models.CharField(
-                db_index=True, max_length=249, null=True, unique=True
-            ),
+            field=models.CharField(db_index=True, max_length=249, null=True),
),
+        # Add the unique constraint on root_uuid and xform
+        migrations.AddConstraint(
+            model_name='instance',
+            constraint=UniqueConstraint(
+                fields=['root_uuid', 'xform'], name='unique_root_uuid_xform'
+            ),
+        ),
]
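AddConstraint will fail if the table still holds rows that share a (root_uuid, xform) pair, which is why the duplicate-cleanup command above runs first. A pre-check sketch using the Instance model from this commit (not part of the migration itself):

from django.db.models import Count

from kobo.apps.openrosa.apps.logger.models.instance import Instance

# (root_uuid, xform) pairs that would violate the new constraint.
# NULL root_uuid values are excluded because unique constraints ignore NULLs.
conflicts = (
    Instance.objects.exclude(root_uuid=None)
    .values('root_uuid', 'xform')
    .annotate(n=Count('id'))
    .filter(n__gt=1)
)
print(conflicts.count(), 'conflicting (root_uuid, xform) pairs')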
9 changes: 8 additions & 1 deletion kobo/apps/openrosa/apps/logger/models/instance.py
@@ -12,6 +12,7 @@
from django.contrib.gis.geos import GeometryCollection, Point
from django.utils import timezone
from django.utils.encoding import smart_str
+from django.db.models import UniqueConstraint
from jsonfield import JSONField
from taggit.managers import TaggableManager

@@ -90,7 +91,7 @@ class Instance(AbstractTimeStampedModel):
# do not use it anymore.
deleted_at = models.DateTimeField(null=True, default=None)

-    root_uuid = models.CharField(max_length=249, null=True, unique=True, db_index=True)
+    root_uuid = models.CharField(max_length=249, null=True, db_index=True)
# ODK keeps track of three statuses for an instance:
# incomplete, submitted, complete
# we add a fourth status: submitted_via_web
@@ -110,6 +111,12 @@

class Meta:
app_label = 'logger'
+        constraints = [
+            UniqueConstraint(
+                fields=['root_uuid', 'xform'],
+                name='unique_root_uuid_per_xform'
+            )
+        ]

@property
def asset(self):
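With the Meta.constraints entry above, the database rejects a second instance that reuses a root_uuid within the same form. A test-style illustration; the field values are placeholders and any other required Instance fields are omitted for brevity:

from django.db import IntegrityError

from kobo.apps.openrosa.apps.logger.models.instance import Instance


def demo_unique_root_uuid(xform, xml_one, xml_two):
    # First submission with this root_uuid on the form is accepted.
    Instance.objects.create(xform=xform, xml=xml_one, root_uuid='abc-123')
    try:
        # A second instance reusing the root_uuid on the same form violates
        # the `unique_root_uuid_per_xform` constraint at the database level.
        Instance.objects.create(xform=xform, xml=xml_two, root_uuid='abc-123')
    except IntegrityError:
        print('Duplicate root_uuid rejected for this form')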
