mirror of
https://github.com/booklore-app/booklore.git
synced 2026-02-18 00:17:53 +01:00
fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries (#2636)
* fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries Signed-off-by: Balázs Szücs <bszucs1209@gmail.com> * test(epub-metadata-extractor): add heuristic cover creation methods for EPUB files Signed-off-by: Balázs Szücs <bszucs1209@gmail.com> --------- Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
@@ -104,8 +104,10 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
||||
try {
|
||||
epub = new EpubReader().readEpub(fis);
|
||||
coverImage = epub.getCoverImage();
|
||||
} catch (NullPointerException e) {
|
||||
log.debug("epub4j threw NullPointerException (likely malformed NCX) in {}: {}", epubFile.getName(), e.getMessage());
|
||||
} catch (Exception e) {
|
||||
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks): {}", e.getMessage());
|
||||
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks) in {}: {}", epubFile.getName(), e.getMessage());
|
||||
}
|
||||
|
||||
if (coverImage == null) {
|
||||
@@ -127,6 +129,15 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: search manifest for anything that looks like a cover
|
||||
if (coverImage == null) {
|
||||
String href = findManifestCoverByHeuristic(epubFile);
|
||||
if (href != null) {
|
||||
byte[] data = extractFileFromZip(epubFile, href);
|
||||
if (data != null) return data;
|
||||
}
|
||||
}
|
||||
|
||||
if (coverImage == null && epub != null) {
|
||||
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
|
||||
String id = res.getId();
|
||||
@@ -141,6 +152,12 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
// Absolute last resort: Scan ZIP entries directly
|
||||
if (coverImage == null) {
|
||||
byte[] data = findCoverInZipByHeuristic(epubFile);
|
||||
if (data != null) return data;
|
||||
}
|
||||
|
||||
return (coverImage != null) ? coverImage.getData() : null;
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
|
||||
@@ -148,6 +165,69 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
private String findManifestCoverByHeuristic(File epubFile) {
|
||||
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
dbf.setNamespaceAware(true);
|
||||
dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
|
||||
DocumentBuilder builder = dbf.newDocumentBuilder();
|
||||
|
||||
FileHeader containerHdr = zip.getFileHeader("META-INF/container.xml");
|
||||
if (containerHdr == null) return null;
|
||||
|
||||
try (InputStream cis = zip.getInputStream(containerHdr)) {
|
||||
Document containerDoc = builder.parse(cis);
|
||||
NodeList roots = containerDoc.getElementsByTagName("rootfile");
|
||||
if (roots.getLength() == 0) return null;
|
||||
|
||||
String opfPath = ((Element) roots.item(0)).getAttribute("full-path");
|
||||
if (StringUtils.isBlank(opfPath)) return null;
|
||||
|
||||
FileHeader opfHdr = zip.getFileHeader(opfPath);
|
||||
if (opfHdr == null) return null;
|
||||
|
||||
try (InputStream in = zip.getInputStream(opfHdr)) {
|
||||
Document doc = builder.parse(in);
|
||||
NodeList manifestItems = doc.getElementsByTagName("item");
|
||||
|
||||
for (int i = 0; i < manifestItems.getLength(); i++) {
|
||||
Element item = (Element) manifestItems.item(i);
|
||||
String id = item.getAttribute("id");
|
||||
String href = item.getAttribute("href");
|
||||
String mediaType = item.getAttribute("media-type");
|
||||
|
||||
if ((id != null && id.toLowerCase().contains("cover")) || (href != null && href.toLowerCase().contains("cover"))) {
|
||||
if (mediaType != null && mediaType.startsWith("image/")) {
|
||||
String decodedHref = URLDecoder.decode(href, StandardCharsets.UTF_8);
|
||||
return resolvePath(opfPath, decodedHref);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Heuristic manifest search failed: {}", e.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private byte[] findCoverInZipByHeuristic(File epubFile) {
|
||||
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||
List<FileHeader> fileHeaders = zip.getFileHeaders();
|
||||
for (FileHeader header : fileHeaders) {
|
||||
String name = header.getFileName().toLowerCase();
|
||||
if (name.contains("cover") && (name.endsWith(".jpg") || name.endsWith(".jpeg") || name.endsWith(".png") || name.endsWith(".webp"))) {
|
||||
try (InputStream is = zip.getInputStream(header)) {
|
||||
return is.readAllBytes();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Direct ZIP scan for cover failed: {}", e.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BookMetadata extractMetadata(File epubFile) {
|
||||
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||
|
||||
@@ -465,6 +465,30 @@ class EpubMetadataExtractorTest {
|
||||
assertTrue(cover.length > 0);
|
||||
assertArrayEquals(pngImage, cover);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Should extract cover using manifest heuristic fallback (href containing 'cover')")
|
||||
void extractCover_manifestHeuristic_returnsCoverBytes() throws IOException {
|
||||
byte[] pngImage = createMinimalPngImage();
|
||||
File epubFile = createEpubWithHeuristicManifestCover(pngImage, "some-id", "some-cover-file.png");
|
||||
|
||||
byte[] cover = extractor.extractCover(epubFile);
|
||||
|
||||
assertNotNull(cover, "Cover should be extracted via manifest heuristic");
|
||||
assertArrayEquals(pngImage, cover);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Should extract cover using ZIP heuristic fallback (ZIP entry containing 'cover')")
|
||||
void extractCover_zipHeuristic_returnsCoverBytes() throws IOException {
|
||||
byte[] pngImage = createMinimalPngImage();
|
||||
File epubFile = createEpubWithHeuristicZipCover(pngImage, "OEBPS/my-cool-cover.jpg");
|
||||
|
||||
byte[] cover = extractor.extractCover(epubFile);
|
||||
|
||||
assertNotNull(cover, "Cover should be extracted via ZIP heuristic");
|
||||
assertArrayEquals(pngImage, cover);
|
||||
}
|
||||
}
|
||||
|
||||
@Nested
|
||||
@@ -870,5 +894,92 @@ class EpubMetadataExtractorTest {
|
||||
|
||||
return epubFile;
|
||||
}
|
||||
|
||||
private File createEpubWithHeuristicManifestCover(byte[] coverImageData, String id, String href) throws IOException {
|
||||
String opfContent = String.format("""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<dc:title>Book with Heuristic Manifest Cover</dc:title>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="%s" href="%s" media-type="image/png"/>
|
||||
</manifest>
|
||||
</package>
|
||||
""", id, href);
|
||||
|
||||
File epubFile = tempDir.resolve("test-heuristic-manifest-" + System.nanoTime() + ".epub").toFile();
|
||||
|
||||
String containerXml = """
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||
<rootfiles>
|
||||
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||
</rootfiles>
|
||||
</container>
|
||||
""";
|
||||
|
||||
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
|
||||
zos.putNextEntry(new ZipEntry("mimetype"));
|
||||
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
|
||||
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
|
||||
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry("OEBPS/" + href));
|
||||
zos.write(coverImageData);
|
||||
zos.closeEntry();
|
||||
}
|
||||
|
||||
return epubFile;
|
||||
}
|
||||
|
||||
private File createEpubWithHeuristicZipCover(byte[] coverImageData, String path) throws IOException {
|
||||
File epubFile = tempDir.resolve("test-heuristic-zip-" + System.nanoTime() + ".epub").toFile();
|
||||
|
||||
String containerXml = """
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||
<rootfiles>
|
||||
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||
</rootfiles>
|
||||
</container>
|
||||
""";
|
||||
|
||||
String opfContent = """
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<dc:title>Book with Heuristic ZIP Cover</dc:title>
|
||||
</metadata>
|
||||
</package>
|
||||
""";
|
||||
|
||||
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
|
||||
zos.putNextEntry(new ZipEntry("mimetype"));
|
||||
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
|
||||
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
|
||||
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
zos.putNextEntry(new ZipEntry(path));
|
||||
zos.write(coverImageData);
|
||||
zos.closeEntry();
|
||||
}
|
||||
|
||||
return epubFile;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user