perf(cbz): optimize CBZ reading (#1980)

* perf(cbz): improve zip file compatibility by enabling Unicode extra fields and ignoring local file headers

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

* feat(perf): Implement robust ZipFile optimization with fallback

Updated ZipFile reading logic in CbxReaderService, CbxProcessor, and CbxConversionService. Implemented a 'Fast Path' (Central Directory only) for speed (20x faster on large archives). Implemented a 'Slow Path' (Local Header scanning) fallback if the fast path fails or finds no images. Ensures compatibility with older or malformed archives where Unicode names are only present in local headers. Refactored extraction logic to helper methods where appropriate to support retry mechanism.

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs
2025-12-25 18:10:14 +01:00
committed by GitHub
parent f5e99d52c1
commit 21bcc7d382
3 changed files with 98 additions and 32 deletions

View File

@@ -114,24 +114,45 @@ public class CbxProcessor extends AbstractFileProcessor implements BookFileProce
}
private Optional<BufferedImage> extractFirstImageFromZip(File file) {
try (ZipFile zipFile = ZipFile.builder().setFile(file).get()) {
return Collections.list(zipFile.getEntries()).stream()
.filter(e -> !e.isDirectory() && IMAGE_EXTENSION_CASE_INSENSITIVE_PATTERN.matcher(e.getName()).matches())
.min(Comparator.comparing(ZipArchiveEntry::getName))
.map(entry -> {
try (InputStream is = zipFile.getInputStream(entry)) {
return ImageIO.read(is);
} catch (Exception e) {
log.warn("Failed to read image from ZIP entry {}: {}", entry.getName(), e.getMessage());
return null;
}
});
// Fast path: Try reading from Central Directory
try (ZipFile zipFile = ZipFile.builder()
.setFile(file)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(true)
.get()) {
Optional<BufferedImage> image = findAndReadFirstImage(zipFile);
if (image.isPresent()) return image;
} catch (Exception e) {
log.debug("Fast path failed for ZIP extraction: {}", e.getMessage());
}
// Slow path: Fallback to scanning local file headers
try (ZipFile zipFile = ZipFile.builder()
.setFile(file)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(false)
.get()) {
return findAndReadFirstImage(zipFile);
} catch (Exception e) {
log.error("Error extracting ZIP: {}", e.getMessage());
return Optional.empty();
}
}
private Optional<BufferedImage> findAndReadFirstImage(ZipFile zipFile) {
return Collections.list(zipFile.getEntries()).stream()
.filter(e -> !e.isDirectory() && IMAGE_EXTENSION_CASE_INSENSITIVE_PATTERN.matcher(e.getName()).matches())
.min(Comparator.comparing(ZipArchiveEntry::getName))
.map(entry -> {
try (InputStream is = zipFile.getInputStream(entry)) {
return ImageIO.read(is);
} catch (Exception e) {
log.warn("Failed to read image from ZIP entry {}: {}", entry.getName(), e.getMessage());
return null;
}
});
}
private Optional<BufferedImage> extractFirstImageFrom7z(File file) {
try (SevenZFile sevenZFile = SevenZFile.builder().setFile(file).get()) {
List<SevenZArchiveEntry> imageEntries = new ArrayList<>();

View File

@@ -196,23 +196,44 @@ public class CbxConversionService {
}
private List<Path> extractImagesFromZip(File cbzFile, Path extractedImagesDir) throws IOException {
// Fast path: Try reading from Central Directory
try (ZipFile zipFile = ZipFile.builder()
.setFile(cbzFile)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(true)
.get()) {
List<Path> paths = extractImagesFromZipFile(zipFile, extractedImagesDir);
if (!paths.isEmpty()) return paths;
} catch (Exception e) {
log.debug("Fast path extraction failed for {}: {}", cbzFile.getName(), e.getMessage());
}
// Slow path: Fallback to scanning local file headers
try (ZipFile zipFile = ZipFile.builder()
.setFile(cbzFile)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(false)
.get()) {
return extractImagesFromZipFile(zipFile, extractedImagesDir);
}
}
private List<Path> extractImagesFromZipFile(ZipFile zipFile, Path extractedImagesDir) {
List<Path> imagePaths = new ArrayList<>();
try (ZipFile zipFile = ZipFile.builder().setFile(cbzFile).get()) {
for (ZipArchiveEntry entry : Collections.list(zipFile.getEntries())) {
if (entry.isDirectory() || !isImageFile(entry.getName())) {
continue;
}
for (ZipArchiveEntry entry : Collections.list(zipFile.getEntries())) {
if (entry.isDirectory() || !isImageFile(entry.getName())) {
continue;
}
try {
validateImageSize(entry.getName(), entry.getSize());
Path outputPath = extractedImagesDir.resolve(extractFileName(entry.getName()));
try (InputStream inputStream = zipFile.getInputStream(entry)) {
Path outputPath = extractedImagesDir.resolve(extractFileName(entry.getName()));
Files.copy(inputStream, outputPath);
imagePaths.add(outputPath);
} catch (Exception e) {
log.warn("Error extracting image {}: {}", entry.getName(), e.getMessage());
}
} catch (Exception e) {
log.warn("Error extracting image {}: {}", entry.getName(), e.getMessage());
}
}

View File

@@ -137,25 +137,36 @@ public class CbxReaderService {
String[] encodingsToTry = {"UTF-8", "Shift_JIS", "ISO-8859-1", "CP437", "MS932"};
for (String encoding : encodingsToTry) {
Charset charset = Charset.forName(encoding);
try {
extractZipWithEncoding(cbzPath, targetDir, Charset.forName(encoding));
return;
} catch (IllegalArgumentException | java.util.zip.ZipException e) {
log.debug("Failed to extract with encoding {}: {}", encoding, e.getMessage());
// Fast path: Try reading from Central Directory only
if (extractZipWithEncoding(cbzPath, targetDir, charset, true)) return;
} catch (Exception e) {
log.debug("Fast path failed for encoding {}: {}", encoding, e.getMessage());
}
try {
// Slow path: Fallback to scanning local file headers
if (extractZipWithEncoding(cbzPath, targetDir, charset, false)) return;
} catch (Exception e) {
log.debug("Slow path failed for encoding {}: {}", encoding, e.getMessage());
}
}
throw new IOException("Unable to extract ZIP archive with any supported encoding");
}
private void extractZipWithEncoding(Path cbzPath, Path targetDir, Charset charset) throws IOException {
private boolean extractZipWithEncoding(Path cbzPath, Path targetDir, Charset charset, boolean useFastPath) throws IOException {
try (org.apache.commons.compress.archivers.zip.ZipFile zipFile =
org.apache.commons.compress.archivers.zip.ZipFile.builder()
.setPath(cbzPath)
.setCharset(charset)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(useFastPath)
.get()) {
var entries = zipFile.getEntries();
boolean foundImages = false;
while (entries.hasMoreElements()) {
ZipArchiveEntry entry = entries.nextElement();
if (!entry.isDirectory() && isImageFile(entry.getName())) {
@@ -163,9 +174,11 @@ public class CbxReaderService {
Path target = targetDir.resolve(fileName);
try (InputStream in = zipFile.getInputStream(entry)) {
Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
foundImages = true;
}
}
}
return foundImages;
}
}
@@ -333,10 +346,19 @@ public class CbxReaderService {
String[] encodingsToTry = {"UTF-8", "Shift_JIS", "ISO-8859-1", "CP437", "MS932"};
for (String encoding : encodingsToTry) {
Charset charset = Charset.forName(encoding);
try {
return estimateCbzWithEncoding(cbxPath, Charset.forName(encoding));
} catch (IllegalArgumentException | java.util.zip.ZipException e) {
log.debug("Failed to estimate with encoding {}: {}", encoding, e.getMessage());
long size = estimateCbzWithEncoding(cbxPath, charset, true);
if (size > 0) return size;
} catch (Exception e) {
log.debug("Fast path estimation failed for encoding {}: {}", encoding, e.getMessage());
}
try {
long size = estimateCbzWithEncoding(cbxPath, charset, false);
if (size > 0) return size;
} catch (Exception e) {
log.debug("Slow path estimation failed for encoding {}: {}", encoding, e.getMessage());
}
}
@@ -344,11 +366,13 @@ public class CbxReaderService {
return Long.MAX_VALUE;
}
private long estimateCbzWithEncoding(Path cbxPath, Charset charset) throws IOException {
private long estimateCbzWithEncoding(Path cbxPath, Charset charset, boolean useFastPath) throws IOException {
try (org.apache.commons.compress.archivers.zip.ZipFile zipFile =
org.apache.commons.compress.archivers.zip.ZipFile.builder()
.setPath(cbxPath)
.setCharset(charset)
.setUseUnicodeExtraFields(true)
.setIgnoreLocalFileHeader(useFastPath)
.get()) {
long total = 0;