fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries (#2636)

* fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

* test(epub-metadata-extractor): add heuristic cover creation methods for EPUB files

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2026-02-18 00:17:53 +01:00 · 2026-02-10 00:46:50 +01:00
parent ffd4615b87
commit 45dba44833
2 changed files with 192 additions and 1 deletions
--- a/booklore-api/src/main/java/org/booklore/service/metadata/extractor/EpubMetadataExtractor.java
+++ b/booklore-api/src/main/java/org/booklore/service/metadata/extractor/EpubMetadataExtractor.java
@@ -104,8 +104,10 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
            try {
                epub = new EpubReader().readEpub(fis);
                coverImage = epub.getCoverImage();
+            } catch (NullPointerException e) {
+                log.debug("epub4j threw NullPointerException (likely malformed NCX) in {}: {}", epubFile.getName(), e.getMessage());
            } catch (Exception e) {
-                log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks): {}", e.getMessage());
+                log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks) in {}: {}", epubFile.getName(), e.getMessage());
            }

            if (coverImage == null) {
@@ -127,6 +129,15 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
                }
            }

+            // Fallback: search manifest for anything that looks like a cover
+            if (coverImage == null) {
+                String href = findManifestCoverByHeuristic(epubFile);
+                if (href != null) {
+                    byte[] data = extractFileFromZip(epubFile, href);
+                    if (data != null) return data;
+                }
+            }
+
            if (coverImage == null && epub != null) {
                for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
                    String id = res.getId();
@@ -141,6 +152,12 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
                }
            }

+            // Absolute last resort: Scan ZIP entries directly
+            if (coverImage == null) {
+                byte[] data = findCoverInZipByHeuristic(epubFile);
+                if (data != null) return data;
+            }
+
            return (coverImage != null) ? coverImage.getData() : null;
        } catch (Exception e) {
            log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
@@ -148,6 +165,69 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
        }
    }

+    /**
+     * Heuristic fallback: returns the ZIP path of the first manifest image item whose id or href
+     * contains "cover" (href percent-decoded, then resolved via resolvePath against the OPF path), or null.
+     */
+    private String findManifestCoverByHeuristic(File epubFile) {
+        try (ZipFile zip = new ZipFile(epubFile)) {
+            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+            dbf.setNamespaceAware(true);
+            dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            DocumentBuilder builder = dbf.newDocumentBuilder();
+
+            FileHeader containerHdr = zip.getFileHeader("META-INF/container.xml");
+            if (containerHdr == null) return null;
+
+            String opfPath;
+            try (InputStream cis = zip.getInputStream(containerHdr)) {
+                NodeList roots = builder.parse(cis).getElementsByTagName("rootfile");
+                if (roots.getLength() == 0) return null;
+                opfPath = ((Element) roots.item(0)).getAttribute("full-path");
+            }
+            FileHeader opfHdr = StringUtils.isBlank(opfPath) ? null : zip.getFileHeader(opfPath);
+            if (opfHdr == null) return null;
+
+            try (InputStream in = zip.getInputStream(opfHdr)) {
+                NodeList manifestItems = builder.parse(in).getElementsByTagName("item");
+                for (int i = 0; i < manifestItems.getLength(); i++) {
+                    Element item = (Element) manifestItems.item(i);
+                    // getAttribute() yields "" (never null) for a missing attribute, so test for blank, not null.
+                    String href = item.getAttribute("href");
+                    boolean namedCover = item.getAttribute("id").toLowerCase().contains("cover") || href.toLowerCase().contains("cover");
+                    boolean isImage = item.getAttribute("media-type").startsWith("image/");
+                    if (StringUtils.isBlank(href) || !namedCover || !isImage) continue;
+                    try {
+                        // URLDecoder does form decoding ('+' -> ' '); escape literal '+' first so file names keep it.
+                        return resolvePath(opfPath, URLDecoder.decode(href.replace("+", "%2B"), StandardCharsets.UTF_8));
+                    } catch (IllegalArgumentException e) {
+                        log.debug("Skipping manifest item with undecodable href '{}': {}", href, e.getMessage());
+                    }
+                }
+            }
+        } catch (Exception e) {
+            log.debug("Heuristic manifest search failed: {}", e.getMessage());
+        }
+        return null;
+    }
+
+    /** Last resort: bytes of the first ZIP entry named like a cover image ("cover" + jpg/jpeg/png/webp), else null. */
+    private byte[] findCoverInZipByHeuristic(File epubFile) {
+        try (ZipFile archive = new ZipFile(epubFile)) {
+            for (FileHeader entry : archive.getFileHeaders()) {
+                String path = entry.getFileName().toLowerCase();
+                boolean imageExt = path.endsWith(".jpg") || path.endsWith(".jpeg") || path.endsWith(".png") || path.endsWith(".webp");
+                if (!imageExt || !path.contains("cover")) continue;
+                try (InputStream entryStream = archive.getInputStream(entry)) {
+                    return entryStream.readAllBytes();
+                }
+            }
+        } catch (Exception e) {
+            log.debug("Direct ZIP scan for cover failed: {}", e.getMessage());
+        }
+        return null;
+    }
+
    @Override
    public BookMetadata extractMetadata(File epubFile) {
        try (ZipFile zip = new ZipFile(epubFile)) {
--- a/booklore-api/src/test/java/org/booklore/service/metadata/extractor/EpubMetadataExtractorTest.java
+++ b/booklore-api/src/test/java/org/booklore/service/metadata/extractor/EpubMetadataExtractorTest.java
@@ -465,6 +465,30 @@ class EpubMetadataExtractorTest {
            assertTrue(cover.length > 0);
            assertArrayEquals(pngImage, cover);
        }
+
+        @Test
+        @DisplayName("Should extract cover using manifest heuristic fallback (href containing 'cover')")
+        void extractCover_manifestHeuristic_returnsCoverBytes() throws IOException {
+            // The item id has no "cover" in it; of the manifest attributes only the href does.
+            byte[] expected = createMinimalPngImage();
+            File epub = createEpubWithHeuristicManifestCover(expected, "some-id", "some-cover-file.png");
+
+            byte[] actual = extractor.extractCover(epub);
+            assertNotNull(actual, "Cover should be extracted via manifest heuristic");
+            assertArrayEquals(expected, actual);
+        }
+
+        @Test
+        @DisplayName("Should extract cover using ZIP heuristic fallback (ZIP entry containing 'cover')")
+        void extractCover_zipHeuristic_returnsCoverBytes() throws IOException {
+            // The entry is named *.jpg but stores the PNG fixture; the bytes must come back unchanged.
+            byte[] expected = createMinimalPngImage();
+            File epub = createEpubWithHeuristicZipCover(expected, "OEBPS/my-cool-cover.jpg");
+
+            byte[] actual = extractor.extractCover(epub);
+            assertNotNull(actual, "Cover should be extracted via ZIP heuristic");
+            assertArrayEquals(expected, actual);
+        }
    }

    @Nested
@@ -870,5 +894,92 @@ class EpubMetadataExtractorTest {

        return epubFile;
    }
+
+    /**
+     * Writes a temp EPUB whose OPF declares one image/png manifest item with the given id and href;
+     * the image bytes are stored at OEBPS/&lt;href&gt;. Returns the created file.
+     */
+    private File createEpubWithHeuristicManifestCover(byte[] coverImageData, String id, String href) throws IOException {
+        String containerXml = """
+            <?xml version="1.0" encoding="UTF-8"?>
+            <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+                <rootfiles>
+                    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+                </rootfiles>
+            </container>
+            """;
+        String packageDoc = """
+            <?xml version="1.0" encoding="UTF-8"?>
+            <package xmlns="http://www.idpf.org/2007/opf" version="3.0">
+                <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+                    <dc:title>Book with Heuristic Manifest Cover</dc:title>
+                </metadata>
+                <manifest>
+                    <item id="%s" href="%s" media-type="image/png"/>
+                </manifest>
+            </package>
+            """.formatted(id, href);
+
+        File target = tempDir.resolve("test-heuristic-manifest-" + System.nanoTime() + ".epub").toFile();
+        try (ZipOutputStream archive = new ZipOutputStream(new FileOutputStream(target))) {
+            archive.putNextEntry(new ZipEntry("mimetype"));
+            archive.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // container.xml names OEBPS/content.opf as the root file
+            archive.putNextEntry(new ZipEntry("META-INF/container.xml"));
+            archive.write(containerXml.getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // package document carrying the single manifest item
+            archive.putNextEntry(new ZipEntry("OEBPS/content.opf"));
+            archive.write(packageDoc.getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // image payload, stored next to the OPF so the relative href points at it
+            archive.putNextEntry(new ZipEntry("OEBPS/" + href));
+            archive.write(coverImageData);
+            archive.closeEntry();
+        }
+        return target;
+    }
+
+    /**
+     * Writes a temp EPUB whose OPF has a title but no manifest, plus one extra ZIP entry at
+     * {@code path} holding the image bytes. Returns the created file.
+     */
+    private File createEpubWithHeuristicZipCover(byte[] coverImageData, String path) throws IOException {
+        String containerXml = """
+            <?xml version="1.0" encoding="UTF-8"?>
+            <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+                <rootfiles>
+                    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+                </rootfiles>
+            </container>
+            """;
+        String packageDoc = """
+            <?xml version="1.0" encoding="UTF-8"?>
+            <package xmlns="http://www.idpf.org/2007/opf" version="3.0">
+                <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+                    <dc:title>Book with Heuristic ZIP Cover</dc:title>
+                </metadata>
+            </package>
+            """;
+
+        File target = tempDir.resolve("test-heuristic-zip-" + System.nanoTime() + ".epub").toFile();
+        try (ZipOutputStream archive = new ZipOutputStream(new FileOutputStream(target))) {
+            archive.putNextEntry(new ZipEntry("mimetype"));
+            archive.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // container.xml names OEBPS/content.opf as the root file
+            archive.putNextEntry(new ZipEntry("META-INF/container.xml"));
+            archive.write(containerXml.getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // package document without any manifest entries
+            archive.putNextEntry(new ZipEntry("OEBPS/content.opf"));
+            archive.write(packageDoc.getBytes(StandardCharsets.UTF_8));
+            archive.closeEntry();
+            // image payload at the caller-chosen entry name
+            archive.putNextEntry(new ZipEntry(path));
+            archive.write(coverImageData);
+            archive.closeEntry();
+        }
+        return target;
+    }
 }