From beca8ee0856e2ea16679c32d8fca803c7fd6db54 Mon Sep 17 00:00:00 2001
From: Kharec
Date: Wed, 18 Feb 2026 03:56:44 +0100
Subject: [PATCH] perf(library): optimize paginated fetch with bounded concurrent scheduling

Replace the submit-everything-up-front fan-out (50 workers, one future per
estimated page) with a bounded scheduler that keeps at most 16 requests in
flight and tops the pool up as each result arrives. A short OR empty page
lowers the stop boundary so pages past the end of the library are never
submitted, instead of cancelling the whole backlog; results from pages
already in flight are still collected. Page ordering is preserved by
collecting into a page-indexed dict and extending in sorted order.

---
 auditui/library/client_fetch.py | 94 ++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 26 deletions(-)

diff --git a/auditui/library/client_fetch.py b/auditui/library/client_fetch.py
index 4294aa5..5475deb 100644
--- a/auditui/library/client_fetch.py
+++ b/auditui/library/client_fetch.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-
 from typing import Any
 
 from ..types import LibraryItem, StatusCallback
@@ -76,39 +75,73 @@ class LibraryClientFetchMixin:
         all_items.extend(first_page_items)
 
         if len(first_page_items) < page_size:
             return all_items
 
+        estimated_pages = self._estimate_total_pages(
+            library_response, page_size)
+        page_results = self._fetch_remaining_pages(
+            response_groups=response_groups,
+            page_size=page_size,
+            estimated_pages=estimated_pages,
+            initial_total=len(first_page_items),
+            on_progress=on_progress,
+        )
+
+        for page_num in sorted(page_results.keys()):
+            all_items.extend(page_results[page_num])
+
+        return all_items
+
+    def _estimate_total_pages(self, library_response: dict, page_size: int) -> int:
+        """Estimate total pages from API metadata with a conservative cap."""
         total_items_estimate = library_response.get(
             "total_results"
         ) or library_response.get("total")
-        if total_items_estimate:
-            estimated_pages = (total_items_estimate + page_size - 1) // page_size
-            estimated_pages = min(estimated_pages, 1000)
-        else:
-            estimated_pages = 500
+        if not total_items_estimate:
+            return 500
+        estimated_pages = (total_items_estimate + page_size - 1) // page_size
+        return min(estimated_pages, 1000)
 
-        max_workers = 50
+    def _fetch_remaining_pages(
+        self,
+        response_groups: str,
+        page_size: int,
+        estimated_pages: int,
+        initial_total: int,
+        on_progress: StatusCallback | None = None,
+    ) -> dict[int, list[LibraryItem]]:
+        """Fetch pages 2..N with bounded in-flight requests for faster startup."""
         page_results: dict[int, list[LibraryItem]] = {}
+        max_workers = min(16, max(1, estimated_pages - 1))
+        next_page_to_submit = 2
+        stop_page = estimated_pages + 1
+        completed_count = 0
+        total_items = initial_total
 
         with ThreadPoolExecutor(max_workers=max_workers) as executor:
             future_to_page: dict = {}
-            for page in range(2, estimated_pages + 1):
+            while (
+                next_page_to_submit <= estimated_pages
+                and next_page_to_submit < stop_page
+                and len(future_to_page) < max_workers
+            ):
                 future = executor.submit(
-                    self._fetch_page, page, page_size, response_groups
+                    self._fetch_page,
+                    next_page_to_submit,
+                    page_size,
+                    response_groups,
                 )
-                future_to_page[future] = page
+                future_to_page[future] = next_page_to_submit
+                next_page_to_submit += 1
 
-            completed_count = 0
-            total_items = len(first_page_items)
-
-            for future in as_completed(future_to_page):
+            while future_to_page:
+                future = next(as_completed(future_to_page))
+                future_to_page.pop(future)
                 try:
                     fetched_page, items = future.result()
-                    future_to_page.pop(future, None)
-                    if not items or len(items) < page_size:
-                        for remaining_future in list(future_to_page.keys()):
-                            remaining_future.cancel()
-                        break
+                except Exception:
+                    continue
+                if items:
                     page_results[fetched_page] = items
                     total_items += len(items)
                     completed_count += 1
@@ -116,12 +149,21 @@ class LibraryClientFetchMixin:
                         on_progress(
                             f"Fetched {completed_count} pages ({total_items} items)..."
                         )
+                if not items or len(items) < page_size:
+                    stop_page = min(stop_page, fetched_page)
 
-                except Exception:
-                    future_to_page.pop(future, None)
-                    pass
+                while (
+                    next_page_to_submit <= estimated_pages
+                    and next_page_to_submit < stop_page
+                    and len(future_to_page) < max_workers
+                ):
+                    next_future = executor.submit(
+                        self._fetch_page,
+                        next_page_to_submit,
+                        page_size,
+                        response_groups,
+                    )
+                    future_to_page[next_future] = next_page_to_submit
+                    next_page_to_submit += 1
 
-        for page_num in sorted(page_results.keys()):
-            all_items.extend(page_results[page_num])
-
-        return all_items
+        return page_results