fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries (#2636)

* fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

* test(epub-metadata-extractor): add heuristic cover creation methods for EPUB files

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs
2026-02-10 00:46:50 +01:00
committed by GitHub
parent ffd4615b87
commit 45dba44833
2 changed files with 192 additions and 1 deletions

View File

@@ -104,8 +104,10 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
try {
epub = new EpubReader().readEpub(fis);
coverImage = epub.getCoverImage();
} catch (NullPointerException e) {
log.debug("epub4j threw NullPointerException (likely malformed NCX) in {}: {}", epubFile.getName(), e.getMessage());
} catch (Exception e) {
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks): {}", e.getMessage());
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks) in {}: {}", epubFile.getName(), e.getMessage());
}
if (coverImage == null) {
@@ -127,6 +129,15 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
}
}
// Fallback: search manifest for anything that looks like a cover
if (coverImage == null) {
String href = findManifestCoverByHeuristic(epubFile);
if (href != null) {
byte[] data = extractFileFromZip(epubFile, href);
if (data != null) return data;
}
}
if (coverImage == null && epub != null) {
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
String id = res.getId();
@@ -141,6 +152,12 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
}
}
// Absolute last resort: Scan ZIP entries directly
if (coverImage == null) {
byte[] data = findCoverInZipByHeuristic(epubFile);
if (data != null) return data;
}
return (coverImage != null) ? coverImage.getData() : null;
} catch (Exception e) {
log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
@@ -148,6 +165,69 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
}
}
/**
 * Looks through the OPF manifest for an image item that "looks like" a cover.
 *
 * <p>The OPF location is read from {@code META-INF/container.xml} (first
 * {@code rootfile} element). Every manifest {@code item} whose {@code id} or
 * {@code href} contains "cover" (case-insensitive) and whose {@code media-type}
 * starts with {@code image/} is a candidate; the first candidate with a usable
 * href wins.
 *
 * <p>Elements are matched by local name so that prefixed documents
 * (e.g. {@code <opf:item>}) are handled as well as un-prefixed ones.
 *
 * @param epubFile the EPUB archive to inspect
 * @return the candidate's href, percent-decoded and passed through
 *         {@code resolvePath(opfPath, href)}, or {@code null} when nothing
 *         matched or the archive/XML could not be read
 */
private String findManifestCoverByHeuristic(File epubFile) {
    try (ZipFile zip = new ZipFile(epubFile)) {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        DocumentBuilder builder = dbf.newDocumentBuilder();

        FileHeader containerHdr = zip.getFileHeader("META-INF/container.xml");
        if (containerHdr == null) return null;

        String opfPath;
        try (InputStream cis = zip.getInputStream(containerHdr)) {
            Document containerDoc = builder.parse(cis);
            // Local-name lookup: the parser is namespace-aware, so this also finds prefixed elements.
            NodeList roots = containerDoc.getElementsByTagNameNS("*", "rootfile");
            if (roots.getLength() == 0) return null;
            opfPath = ((Element) roots.item(0)).getAttribute("full-path");
        }
        if (StringUtils.isBlank(opfPath)) return null;

        FileHeader opfHdr = zip.getFileHeader(opfPath);
        if (opfHdr == null) return null;

        try (InputStream in = zip.getInputStream(opfHdr)) {
            Document doc = builder.parse(in);
            NodeList manifestItems = doc.getElementsByTagNameNS("*", "item");
            for (int i = 0; i < manifestItems.getLength(); i++) {
                Element item = (Element) manifestItems.item(i);
                // Element.getAttribute returns "" (never null) for a missing attribute.
                String id = item.getAttribute("id");
                String href = item.getAttribute("href");
                String mediaType = item.getAttribute("media-type");

                // An item without an href cannot be extracted; keep scanning instead of
                // returning a path that points at nothing.
                if (StringUtils.isBlank(href)) continue;
                if (!mediaType.startsWith("image/")) continue;

                boolean mentionsCover = id.toLowerCase(java.util.Locale.ROOT).contains("cover")
                        || href.toLowerCase(java.util.Locale.ROOT).contains("cover");
                if (mentionsCover) {
                    return resolvePath(opfPath, decodeManifestHref(href));
                }
            }
        }
    } catch (Exception e) {
        log.debug("Heuristic manifest search failed: {}", e.getMessage());
    }
    return null;
}

/**
 * Percent-decodes a manifest href as a URI path rather than as form data.
 *
 * <p>{@link URLDecoder} implements {@code application/x-www-form-urlencoded}
 * decoding and would turn a literal {@code '+'} into a space, so plus signs are
 * protected before decoding. If the href contains a malformed escape sequence
 * the raw value is returned unchanged instead of failing the whole search.
 */
private static String decodeManifestHref(String href) {
    try {
        return URLDecoder.decode(href.replace("+", "%2B"), StandardCharsets.UTF_8);
    } catch (IllegalArgumentException e) {
        // Malformed escape such as a bare '%': best effort is the href as written.
        return href;
    }
}
/**
 * Scans the raw ZIP entries for a file that looks like a cover image.
 *
 * <p>An entry qualifies when its lower-cased name contains "cover" and ends
 * with {@code .jpg}, {@code .jpeg}, {@code .png} or {@code .webp}. The bytes of
 * the first qualifying entry, in the order the archive lists them, are returned.
 *
 * @param epubFile the EPUB archive to scan
 * @return the entry's content, or {@code null} if no entry qualifies or the
 *         archive cannot be read (the failure is logged at debug level)
 */
private byte[] findCoverInZipByHeuristic(File epubFile) {
    try (ZipFile archive = new ZipFile(epubFile)) {
        for (FileHeader entry : archive.getFileHeaders()) {
            String lowerName = entry.getFileName().toLowerCase();
            if (!lowerName.contains("cover")) {
                continue;
            }
            boolean hasImageExtension = lowerName.endsWith(".jpg")
                    || lowerName.endsWith(".jpeg")
                    || lowerName.endsWith(".png")
                    || lowerName.endsWith(".webp");
            if (!hasImageExtension) {
                continue;
            }
            // First match wins; a read failure here falls through to the catch below.
            try (InputStream entryStream = archive.getInputStream(entry)) {
                return entryStream.readAllBytes();
            }
        }
    } catch (Exception e) {
        log.debug("Direct ZIP scan for cover failed: {}", e.getMessage());
    }
    return null;
}
@Override
public BookMetadata extractMetadata(File epubFile) {
try (ZipFile zip = new ZipFile(epubFile)) {

View File

@@ -465,6 +465,30 @@ class EpubMetadataExtractorTest {
assertTrue(cover.length > 0);
assertArrayEquals(pngImage, cover);
}
@Test
@DisplayName("Should extract cover using manifest heuristic fallback (href containing 'cover')")
void extractCover_manifestHeuristic_returnsCoverBytes() throws IOException {
    // Fixture: the manifest item's id has no "cover" in it; only the href does.
    byte[] expectedBytes = createMinimalPngImage();
    File fixture = createEpubWithHeuristicManifestCover(expectedBytes, "some-id", "some-cover-file.png");

    byte[] actualBytes = extractor.extractCover(fixture);

    assertNotNull(actualBytes, "Cover should be extracted via manifest heuristic");
    assertArrayEquals(expectedBytes, actualBytes);
}
@Test
@DisplayName("Should extract cover using ZIP heuristic fallback (ZIP entry containing 'cover')")
void extractCover_zipHeuristic_returnsCoverBytes() throws IOException {
    // Fixture: PNG bytes stored under a ".jpg" entry name that contains "cover".
    byte[] expectedBytes = createMinimalPngImage();
    File fixture = createEpubWithHeuristicZipCover(expectedBytes, "OEBPS/my-cool-cover.jpg");

    byte[] actualBytes = extractor.extractCover(fixture);

    assertNotNull(actualBytes, "Cover should be extracted via ZIP heuristic");
    assertArrayEquals(expectedBytes, actualBytes);
}
}
@Nested
@@ -870,5 +894,92 @@ class EpubMetadataExtractorTest {
return epubFile;
}
/**
 * Builds a small EPUB fixture whose OPF manifest holds exactly one image item.
 *
 * <p>The archive contains four entries: {@code mimetype},
 * {@code META-INF/container.xml} (pointing at {@code OEBPS/content.opf}),
 * the OPF itself, and the image stored at {@code "OEBPS/" + href}.
 * The OPF has a title but no cover-related metadata.
 *
 * @param coverImageData bytes written as the image entry
 * @param id             value substituted into the manifest item's {@code id} attribute
 * @param href           value substituted into the manifest item's {@code href} attribute;
 *                       also used (unescaped) to build the image's ZIP entry name
 * @return the generated file, created under {@code tempDir}
 * @throws IOException if the archive cannot be written
 */
private File createEpubWithHeuristicManifestCover(byte[] coverImageData, String id, String href) throws IOException {
// OPF template: the two %s placeholders receive id and href, media-type is fixed to image/png.
String opfContent = String.format("""
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Book with Heuristic Manifest Cover</dc:title>
</metadata>
<manifest>
<item id="%s" href="%s" media-type="image/png"/>
</manifest>
</package>
""", id, href);
// nanoTime suffix keeps file names distinct when several fixtures are created in one run.
File epubFile = tempDir.resolve("test-heuristic-manifest-" + System.nanoTime() + ".epub").toFile();
String containerXml = """
<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
""";
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
// Entry 1: mimetype marker.
zos.putNextEntry(new ZipEntry("mimetype"));
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 2: container descriptor naming the OPF path.
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 3: the package document built above.
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 4: the image, placed next to the OPF so a relative href resolves to it.
zos.putNextEntry(new ZipEntry("OEBPS/" + href));
zos.write(coverImageData);
zos.closeEntry();
}
return epubFile;
}
/**
 * Builds a small EPUB fixture whose OPF has no manifest at all, plus one extra
 * ZIP entry holding the image bytes.
 *
 * <p>The archive contains {@code mimetype}, {@code META-INF/container.xml}
 * (pointing at {@code OEBPS/content.opf}), an OPF with only a title, and the
 * image stored under the caller-supplied entry name.
 *
 * @param coverImageData bytes written as the image entry
 * @param path           full ZIP entry name for the image
 * @return the generated file, created under {@code tempDir}
 * @throws IOException if the archive cannot be written
 */
private File createEpubWithHeuristicZipCover(byte[] coverImageData, String path) throws IOException {
// nanoTime suffix keeps file names distinct when several fixtures are created in one run.
File epubFile = tempDir.resolve("test-heuristic-zip-" + System.nanoTime() + ".epub").toFile();
String containerXml = """
<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
""";
// The OPF deliberately declares no <manifest>, so the image is not referenced anywhere in it.
String opfContent = """
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Book with Heuristic ZIP Cover</dc:title>
</metadata>
</package>
""";
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
// Entry 1: mimetype marker.
zos.putNextEntry(new ZipEntry("mimetype"));
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 2: container descriptor naming the OPF path.
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 3: the manifest-less package document.
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
zos.closeEntry();
// Entry 4: the image under exactly the name the caller asked for.
zos.putNextEntry(new ZipEntry(path));
zos.write(coverImageData);
zos.closeEntry();
}
return epubFile;
}
}