Optimize DB requests in task list, quality conflicts list and cloudstorages by zhiltsov-max · Pull Request #8275 · cvat-ai/cvat · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Optimize DB requests in task list, quality conflicts list and cloudstorages #8275

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9efedb2
Optimize job viewset requests
zhiltsov-max Aug 7, 2024
4846625
Optimize project requests further
zhiltsov-max Aug 7, 2024
ddfee14
Avoid occasional org retrieval on field access
zhiltsov-max Aug 7, 2024
c4fb2cd
Remove extra fields from jobs query
zhiltsov-max Aug 7, 2024
dc192b4
Move some select_related to prefetch_related
zhiltsov-max Aug 7, 2024
9be5176
Optimize DB access in IAM checks
zhiltsov-max Aug 20, 2024
91f3057
Merge branch 'develop' into zm/optimize-viewsets
zhiltsov-max Jan 17, 2025
fcc456b
Remove duplicated import
zhiltsov-max Jan 17, 2025
d8f9223
Remove extra request in project serializer
zhiltsov-max Jan 20, 2025
c382cc4
Optimize inefficient related object id access in IAM checks
zhiltsov-max Jan 20, 2025
2864da1
Remove extra change
zhiltsov-max Jan 21, 2025
dd0375e
Remove extra assignment
zhiltsov-max Jan 21, 2025
0bebade
Remove extra code
zhiltsov-max Jan 22, 2025
e1c2ef5
Merge branch 'develop' into zm/optimize-viewsets
zhiltsov-max Jan 22, 2025
000537b
Merge branch 'zm/optimize-iam-data-access' into zm/optimize-viewsets
zhiltsov-max Jan 22, 2025
4e23622
Remove extra changes
zhiltsov-max Jan 22, 2025
ea02c54
Merge remote-tracking branch 'origin/zm/optimize-viewsets' into zm/op…
zhiltsov-max Jan 22, 2025
b0ed635
Remove extra changes
zhiltsov-max Jan 22, 2025
47ba880
Remove extra changes
zhiltsov-max Jan 22, 2025
0390f7b
Optimize /api/quality/conflicts
zhiltsov-max Jan 24, 2025
c5cac64
Merge branch 'develop' into zm/optimize-viewsets
zhiltsov-max Apr 30, 2025
7984c36
Optimize task list
zhiltsov-max Apr 30, 2025
3b2a6e9
Optimize quality conflicts list
zhiltsov-max Apr 30, 2025
faaf196
Fix field access
zhiltsov-max Apr 30, 2025
8f1d6bb
Fix field access
zhiltsov-max Apr 30, 2025
f1460e3
Add comment
zhiltsov-max Apr 30, 2025
8e2e221
Be more specific about the optimizations in cloudstorage api
zhiltsov-max May 1, 2025
4039e4f
Improve cs manifest existence check
zhiltsov-max May 1, 2025
3d5c28a
Update changelog
zhiltsov-max May 1, 2025
c9a1fc1
Add comment
zhiltsov-max May 7, 2025
3b87dae
Refactor enum enumeration
zhiltsov-max May 7, 2025
04b1958
Restore all()
zhiltsov-max May 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
</form>
5 changes: 5 additions & 0 deletions changelog.d/20250501_111634_mzhiltso_optimize_viewsets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Fixed

- Improved performance of `GET /api/tasks`, `GET /api/quality/conflicts`
and `GET /api/cloudstorages` requests
(<https://github.com/cvat-ai/cvat/pull/8275>)
7 changes: 6 additions & 1 deletion cvat/apps/engine/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
ZipCompressedChunkWriter,
load_image,
)
from cvat.apps.engine.model_utils import is_field_cached
from cvat.apps.engine.rq import RQMetaWithFailureInfo
from cvat.apps.engine.utils import (
CvatChunkTimestampMismatchError,
Expand Down Expand Up @@ -412,7 +413,11 @@ def get_or_set_task_chunk(
self._make_chunk_key(db_task, chunk_number, quality=quality),
set_callback,
)
db_task.refresh_from_db(fields=["segment_set"])

if is_field_cached(db_task, "segment_set"):
# Refresh segments to report actual dates if they were fetched previously
# Doing so without a check leads to an error if the related object is not prefetched
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I think that's a bug, so I filed a Django bug report: https://code.djangoproject.com/ticket/36372.

db_task.refresh_from_db(fields=["segment_set"])

return self._to_data_with_mime(
self._validate_cache_item_timestamp(item, db_task.get_chunks_updated_date())
Expand Down
5 changes: 5 additions & 0 deletions cvat/apps/engine/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def is_prefetched(queryset: models.QuerySet, field: str) -> bool:
return field in queryset._prefetch_related_lookups


def is_field_cached(instance: models.Model, field: str) -> bool:
    """Check whether ``field`` has a cached value on the model instance.

    Args:
        instance: The model instance whose per-instance state is inspected.
        field: The name to look up in the instance's field cache.

    Returns:
        True if ``field`` is a key of ``instance._state.fields_cache``.
    """
    # NOTE(review): only ``_state.fields_cache`` is consulted. Django appears to
    # keep prefetch_related() results for reverse relations in a separate
    # per-instance cache - confirm callers passing such names get the
    # intended answer.
    return field in instance._state.fields_cache


_QuerysetT = TypeVar("_QuerysetT", bound=models.QuerySet)


Expand Down
38 changes: 22 additions & 16 deletions cvat/apps/engine/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,22 +550,28 @@ def __str__(self):
return self.name

class TaskQuerySet(models.QuerySet):
class JobSummaryFields(str, Enum):
total_jobs_count = "total_jobs_count"
completed_jobs_count = "completed_jobs_count"
validation_jobs_count = "validation_jobs_count"

def with_job_summary(self):
return self.prefetch_related(
'segment_set__job_set',
).annotate(
total_jobs_count=models.Count('segment__job', distinct=True),
completed_jobs_count=models.Count(
'segment__job',
filter=models.Q(segment__job__state=StateChoice.COMPLETED.value) &
models.Q(segment__job__stage=StageChoice.ACCEPTANCE.value),
distinct=True,
),
validation_jobs_count=models.Count(
'segment__job',
filter=models.Q(segment__job__stage=StageChoice.VALIDATION.value),
distinct=True,
)
Fields = self.JobSummaryFields
return self.annotate(
**{
Fields.total_jobs_count.value: models.Count('segment__job', distinct=True),
Fields.completed_jobs_count.value: models.Count(
'segment__job',
filter=models.Q(segment__job__state=StateChoice.COMPLETED.value) &
models.Q(segment__job__stage=StageChoice.ACCEPTANCE.value),
distinct=True,
),
Fields.validation_jobs_count.value: models.Count(
'segment__job',
filter=models.Q(segment__job__stage=StageChoice.VALIDATION.value),
distinct=True,
),
}
)

class Task(TimestampedModel, FileSystemRelatedModel):
Expand Down Expand Up @@ -1317,7 +1323,7 @@ def get_key_file_path(self):

@property
def has_at_least_one_manifest(self) -> bool:
return bool(self.manifests.count())
return self.manifests.exists()

class Storage(models.Model):
location = models.CharField(max_length=16, choices=Location.choices(), default=Location.LOCAL)
Expand Down
2 changes: 1 addition & 1 deletion cvat/apps/engine/permissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def create(cls, request: ExtendedRequest, view: ViewSet, obj: Task | None, iam_c
def create_scope_view(cls, request: ExtendedRequest, task: int | Task, iam_context: dict[str, Any] | None = None):
if isinstance(task, int):
try:
task = Task.objects.get(id=task)
task = Task.objects.select_related("organization").get(id=task)
except Task.DoesNotExist as ex:
raise ValidationError(str(ex))

Expand Down
28 changes: 28 additions & 0 deletions cvat/apps/engine/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2275,6 +2275,33 @@ def _create_files(self, instance, files):
[files_model(data=instance, **f) for f in files[files_type]]
)

class TaskReadListSerializer(serializers.ListSerializer):
def to_representation(self, data):
if isinstance(data, list) and data:
# Optimized prefetch only for the current page
page: list[models.Task] = data

# Annotate page objects
# We do it explicitly here and not in the LIST queryset to avoid
# doing the same DB computations twice - one time for the page retrieval
# and another one for the COUNT(*) request to get the total count
page_task_ids = set(t.id for t in page)
job_summary_fields = [m.value for m in models.TaskQuerySet.JobSummaryFields]
job_counts = {
task["id"]: task
for task in models.Task.objects
.filter(id__in=page_task_ids)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just thought of something WRT this and similar optimizations - how well does this behave when page_size is all and there are a lot of tasks?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

page_size=all must be allowed only in development deployments. It can crash the server because of OOM.

.with_job_summary()
.values("id", *job_summary_fields)
}

for task in page:
task_job_summary = job_counts.get(task.id)
for k in job_summary_fields:
setattr(task, k, task_job_summary[k])

return super().to_representation(data)

class TaskReadSerializer(serializers.ModelSerializer):
data_chunk_size = serializers.ReadOnlyField(source='data.chunk_size', required=False)
data_compressed_chunk_type = serializers.ReadOnlyField(source='data.compressed_chunk_type', required=False)
Expand Down Expand Up @@ -2313,6 +2340,7 @@ class Meta:
'organization': { 'allow_null': True },
'overlap': { 'allow_null': True },
}
list_serializer_class = TaskReadListSerializer

def get_consensus_enabled(self, instance: models.Task) -> bool:
    """Report whether consensus is enabled for the given task.

    Returns:
        True when ``instance.consensus_replicas`` is greater than zero.
    """
    return instance.consensus_replicas > 0
Expand Down
22 changes: 15 additions & 7 deletions cvat/apps/engine/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,16 +903,16 @@ class TaskViewSet(viewsets.GenericViewSet, mixins.ListModelMixin,
):
queryset = Task.objects.select_related(
'data',
'data__validation_layout',
'assignee',
'owner',
'target_storage',
'source_storage',
'annotation_guide',
).prefetch_related(
'segment_set__job_set',
'segment_set__job_set__assignee',
).with_job_summary()
# Avoid loading heavy data via select_related:
# it slows down the COUNT request in the list endpoint
'data__validation_layout',
)

lookup_fields = {
'project_name': 'project__name',
Expand Down Expand Up @@ -952,8 +952,11 @@ def get_queryset(self):
if self.action == 'list':
perm = TaskPermission.create_scope_list(self.request)
queryset = perm.filter(queryset)
# with_job_summary() is optimized in the serializer
elif self.action == 'preview':
queryset = Task.objects.select_related('data')
else:
queryset = queryset.with_job_summary()

return queryset

Expand Down Expand Up @@ -1825,8 +1828,12 @@ class JobViewSet(viewsets.GenericViewSet, mixins.ListModelMixin, mixins.CreateMo
mixins.RetrieveModelMixin, PartialUpdateModelMixin, mixins.DestroyModelMixin,
UploadMixin, DatasetMixin
):
queryset = Job.objects.select_related('assignee', 'segment__task__data',
'segment__task__project', 'segment__task__annotation_guide', 'segment__task__project__annotation_guide',
queryset = Job.objects.select_related(
'assignee',
'segment__task__data',
'segment__task__project',
'segment__task__annotation_guide',
'segment__task__project__annotation_guide',
)

iam_organization_field = 'segment__task__organization'
Expand Down Expand Up @@ -2750,7 +2757,7 @@ class CloudStorageViewSet(viewsets.GenericViewSet, mixins.ListModelMixin,
mixins.RetrieveModelMixin, mixins.CreateModelMixin, mixins.DestroyModelMixin,
PartialUpdateModelMixin
):
queryset = CloudStorageModel.objects.prefetch_related('data').all()
queryset = CloudStorageModel.objects.all()

search_fields = ('provider_type', 'name', 'resource',
'credentials_type', 'owner', 'description')
Expand All @@ -2777,6 +2784,7 @@ def get_queryset(self):
if self.action == 'list':
perm = CloudStoragePermission.create_scope_list(self.request)
queryset = perm.filter(queryset)
queryset = queryset.prefetch_related('owner', 'manifests')

provider_type = self.request.query_params.get('provider_type', None)
if provider_type:
Expand Down
27 changes: 7 additions & 20 deletions cvat/apps/quality_control/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,7 @@
),
)
class QualityConflictsViewSet(viewsets.GenericViewSet, mixins.ListModelMixin):
queryset = (
AnnotationConflict.objects.select_related(
"report",
"report__parent",
"report__job",
"report__job__segment",
"report__job__segment__task",
"report__job__segment__task__organization",
"report__task",
"report__task__organization",
)
.prefetch_related(
"annotation_ids",
)
.all()
)
queryset = AnnotationConflict.objects.prefetch_related("annotation_ids")

iam_organization_field = [
"report__job__segment__task__organization",
Expand All @@ -103,16 +88,18 @@ def get_queryset(self):
# NOTE: This filter is too complex to be implemented by other means,
# it has a dependency on the report type
try:
report = QualityReport.objects.get(id=report_id)
report = QualityReport.objects.select_related(
"job__segment__task__organization",
"task__organization",
).get(id=report_id)
except QualityReport.DoesNotExist as ex:
raise NotFound(f"Report {report_id} does not exist") from ex

self.check_object_permissions(self.request, report)

if report.target == QualityReportTarget.TASK:
queryset = queryset.filter(
Q(report=report) | Q(report__parent=report)
).distinct()
# Task reports do not have own conflicts
queryset = queryset.filter(report__parent=report)
elif report.target == QualityReportTarget.JOB:
queryset = queryset.filter(report=report)
else:
Expand Down
Loading
0