From 21bcc7d38283f8fa7e74dc17e1cd5f66461ca5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= <127139797+balazs-szucs@users.noreply.github.com> Date: Thu, 25 Dec 2025 18:10:14 +0100 Subject: [PATCH] perf(cbz): optimize CBZ reading (#1980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf(cbz): improve zip file compatibility by enabling Unicode extra fields and ignoring local file headers Signed-off-by: Balázs Szücs * feat(perf): Implement robust ZipFile optimization with fallback Updated ZipFile reading logic in CbxReaderService, CbxProcessor, and CbxConversionService. Implemented a 'Fast Path' (Central Directory only) for speed (20x faster on large archives). Implemented a 'Slow Path' (Local Header scanning) fallback if the fast path fails or finds no images. Ensures compatibility with older or malformed archives where Unicode names are only present in local headers. Refactored extraction logic to helper methods where appropriate to support retry mechanism. --------- Signed-off-by: Balázs Szücs --- .../service/fileprocessor/CbxProcessor.java | 45 ++++++++++++++----- .../service/kobo/CbxConversionService.java | 43 +++++++++++++----- .../service/reader/CbxReaderService.java | 42 +++++++++++++---- 3 files changed, 98 insertions(+), 32 deletions(-) diff --git a/booklore-api/src/main/java/com/adityachandel/booklore/service/fileprocessor/CbxProcessor.java b/booklore-api/src/main/java/com/adityachandel/booklore/service/fileprocessor/CbxProcessor.java index 90e857875..18ed419a0 100644 --- a/booklore-api/src/main/java/com/adityachandel/booklore/service/fileprocessor/CbxProcessor.java +++ b/booklore-api/src/main/java/com/adityachandel/booklore/service/fileprocessor/CbxProcessor.java @@ -114,24 +114,45 @@ public class CbxProcessor extends AbstractFileProcessor implements BookFileProce } private Optional extractFirstImageFromZip(File file) { - try (ZipFile zipFile = ZipFile.builder().setFile(file).get()) { - return Collections.list(zipFile.getEntries()).stream() - .filter(e -> !e.isDirectory() && IMAGE_EXTENSION_CASE_INSENSITIVE_PATTERN.matcher(e.getName()).matches()) - .min(Comparator.comparing(ZipArchiveEntry::getName)) - .map(entry -> { - try (InputStream is = zipFile.getInputStream(entry)) { - return ImageIO.read(is); - } catch (Exception e) { - log.warn("Failed to read image from ZIP entry {}: {}", entry.getName(), e.getMessage()); - return null; - } - }); + // Fast path: Try reading from Central Directory + try (ZipFile zipFile = ZipFile.builder() + .setFile(file) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(true) + .get()) { + Optional image = findAndReadFirstImage(zipFile); + if (image.isPresent()) return image; + } catch (Exception e) { + log.debug("Fast path failed for ZIP extraction: {}", e.getMessage()); + } + + // Slow path: Fallback to scanning local file headers + try (ZipFile zipFile = ZipFile.builder() + .setFile(file) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(false) + .get()) { + return findAndReadFirstImage(zipFile); } catch (Exception e) { log.error("Error extracting ZIP: {}", e.getMessage()); return Optional.empty(); } } + private Optional findAndReadFirstImage(ZipFile zipFile) { + return Collections.list(zipFile.getEntries()).stream() + .filter(e -> !e.isDirectory() && IMAGE_EXTENSION_CASE_INSENSITIVE_PATTERN.matcher(e.getName()).matches()) + .min(Comparator.comparing(ZipArchiveEntry::getName)) + .map(entry -> { + try (InputStream is = zipFile.getInputStream(entry)) { + return ImageIO.read(is); + } catch (Exception e) { + log.warn("Failed to read image from ZIP entry {}: {}", entry.getName(), e.getMessage()); + return null; + } + }); + } + private Optional extractFirstImageFrom7z(File file) { try (SevenZFile sevenZFile = SevenZFile.builder().setFile(file).get()) { List imageEntries = new ArrayList<>(); diff --git a/booklore-api/src/main/java/com/adityachandel/booklore/service/kobo/CbxConversionService.java b/booklore-api/src/main/java/com/adityachandel/booklore/service/kobo/CbxConversionService.java index 31a3ad517..150d39922 100644 --- a/booklore-api/src/main/java/com/adityachandel/booklore/service/kobo/CbxConversionService.java +++ b/booklore-api/src/main/java/com/adityachandel/booklore/service/kobo/CbxConversionService.java @@ -196,23 +196,44 @@ public class CbxConversionService { } private List extractImagesFromZip(File cbzFile, Path extractedImagesDir) throws IOException { + // Fast path: Try reading from Central Directory + try (ZipFile zipFile = ZipFile.builder() + .setFile(cbzFile) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(true) + .get()) { + List paths = extractImagesFromZipFile(zipFile, extractedImagesDir); + if (!paths.isEmpty()) return paths; + } catch (Exception e) { + log.debug("Fast path extraction failed for {}: {}", cbzFile.getName(), e.getMessage()); + } + + // Slow path: Fallback to scanning local file headers + try (ZipFile zipFile = ZipFile.builder() + .setFile(cbzFile) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(false) + .get()) { + return extractImagesFromZipFile(zipFile, extractedImagesDir); + } + } + + private List extractImagesFromZipFile(ZipFile zipFile, Path extractedImagesDir) { List imagePaths = new ArrayList<>(); - - try (ZipFile zipFile = ZipFile.builder().setFile(cbzFile).get()) { - for (ZipArchiveEntry entry : Collections.list(zipFile.getEntries())) { - if (entry.isDirectory() || !isImageFile(entry.getName())) { - continue; - } - + for (ZipArchiveEntry entry : Collections.list(zipFile.getEntries())) { + if (entry.isDirectory() || !isImageFile(entry.getName())) { + continue; + } + + try { validateImageSize(entry.getName(), entry.getSize()); - + Path outputPath = extractedImagesDir.resolve(extractFileName(entry.getName())); try (InputStream inputStream = zipFile.getInputStream(entry)) { - Path outputPath = extractedImagesDir.resolve(extractFileName(entry.getName())); Files.copy(inputStream, outputPath); imagePaths.add(outputPath); - } catch (Exception e) { - log.warn("Error extracting image {}: {}", entry.getName(), e.getMessage()); } + } catch (Exception e) { + log.warn("Error extracting image {}: {}", entry.getName(), e.getMessage()); } } diff --git a/booklore-api/src/main/java/com/adityachandel/booklore/service/reader/CbxReaderService.java b/booklore-api/src/main/java/com/adityachandel/booklore/service/reader/CbxReaderService.java index ba5224fc2..c40ef6492 100644 --- a/booklore-api/src/main/java/com/adityachandel/booklore/service/reader/CbxReaderService.java +++ b/booklore-api/src/main/java/com/adityachandel/booklore/service/reader/CbxReaderService.java @@ -137,25 +137,36 @@ public class CbxReaderService { String[] encodingsToTry = {"UTF-8", "Shift_JIS", "ISO-8859-1", "CP437", "MS932"}; for (String encoding : encodingsToTry) { + Charset charset = Charset.forName(encoding); try { - extractZipWithEncoding(cbzPath, targetDir, Charset.forName(encoding)); - return; - } catch (IllegalArgumentException | java.util.zip.ZipException e) { - log.debug("Failed to extract with encoding {}: {}", encoding, e.getMessage()); + // Fast path: Try reading from Central Directory only + if (extractZipWithEncoding(cbzPath, targetDir, charset, true)) return; + } catch (Exception e) { + log.debug("Fast path failed for encoding {}: {}", encoding, e.getMessage()); + } + + try { + // Slow path: Fallback to scanning local file headers + if (extractZipWithEncoding(cbzPath, targetDir, charset, false)) return; + } catch (Exception e) { + log.debug("Slow path failed for encoding {}: {}", encoding, e.getMessage()); } } throw new IOException("Unable to extract ZIP archive with any supported encoding"); } - private void extractZipWithEncoding(Path cbzPath, Path targetDir, Charset charset) throws IOException { + private boolean extractZipWithEncoding(Path cbzPath, Path targetDir, Charset charset, boolean useFastPath) throws IOException { try (org.apache.commons.compress.archivers.zip.ZipFile zipFile = org.apache.commons.compress.archivers.zip.ZipFile.builder() .setPath(cbzPath) .setCharset(charset) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(useFastPath) .get()) { var entries = zipFile.getEntries(); + boolean foundImages = false; while (entries.hasMoreElements()) { ZipArchiveEntry entry = entries.nextElement(); if (!entry.isDirectory() && isImageFile(entry.getName())) { @@ -163,9 +174,11 @@ public class CbxReaderService { Path target = targetDir.resolve(fileName); try (InputStream in = zipFile.getInputStream(entry)) { Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING); + foundImages = true; } } } + return foundImages; } } @@ -333,10 +346,19 @@ public class CbxReaderService { String[] encodingsToTry = {"UTF-8", "Shift_JIS", "ISO-8859-1", "CP437", "MS932"}; for (String encoding : encodingsToTry) { + Charset charset = Charset.forName(encoding); try { - return estimateCbzWithEncoding(cbxPath, Charset.forName(encoding)); - } catch (IllegalArgumentException | java.util.zip.ZipException e) { - log.debug("Failed to estimate with encoding {}: {}", encoding, e.getMessage()); + long size = estimateCbzWithEncoding(cbxPath, charset, true); + if (size > 0) return size; + } catch (Exception e) { + log.debug("Fast path estimation failed for encoding {}: {}", encoding, e.getMessage()); + } + + try { + long size = estimateCbzWithEncoding(cbxPath, charset, false); + if (size > 0) return size; + } catch (Exception e) { + log.debug("Slow path estimation failed for encoding {}: {}", encoding, e.getMessage()); } } @@ -344,11 +366,13 @@ public class CbxReaderService { return Long.MAX_VALUE; } - private long estimateCbzWithEncoding(Path cbxPath, Charset charset) throws IOException { + private long estimateCbzWithEncoding(Path cbxPath, Charset charset, boolean useFastPath) throws IOException { try (org.apache.commons.compress.archivers.zip.ZipFile zipFile = org.apache.commons.compress.archivers.zip.ZipFile.builder() .setPath(cbxPath) .setCharset(charset) + .setUseUnicodeExtraFields(true) + .setIgnoreLocalFileHeader(useFastPath) .get()) { long total = 0;