mirror of
https://github.com/booklore-app/booklore.git
synced 2026-02-18 00:17:53 +01:00
fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries (#2636)
* fix(epub-metadata): enhance cover extraction with heuristic fallbacks for manifest and ZIP entries
  Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
* test(epub-metadata-extractor): add heuristic cover creation methods for EPUB files
  Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
---------
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
@@ -104,8 +104,10 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
|||||||
try {
|
try {
|
||||||
epub = new EpubReader().readEpub(fis);
|
epub = new EpubReader().readEpub(fis);
|
||||||
coverImage = epub.getCoverImage();
|
coverImage = epub.getCoverImage();
|
||||||
|
} catch (NullPointerException e) {
|
||||||
|
log.debug("epub4j threw NullPointerException (likely malformed NCX) in {}: {}", epubFile.getName(), e.getMessage());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks): {}", e.getMessage());
|
log.debug("epub4j failed to parse EPUB for cover extraction (will try fallbacks) in {}: {}", epubFile.getName(), e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (coverImage == null) {
|
if (coverImage == null) {
|
||||||
@@ -127,6 +129,15 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fallback: search manifest for anything that looks like a cover
|
||||||
|
if (coverImage == null) {
|
||||||
|
String href = findManifestCoverByHeuristic(epubFile);
|
||||||
|
if (href != null) {
|
||||||
|
byte[] data = extractFileFromZip(epubFile, href);
|
||||||
|
if (data != null) return data;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (coverImage == null && epub != null) {
|
if (coverImage == null && epub != null) {
|
||||||
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
|
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
|
||||||
String id = res.getId();
|
String id = res.getId();
|
||||||
@@ -141,6 +152,12 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Absolute last resort: Scan ZIP entries directly
|
||||||
|
if (coverImage == null) {
|
||||||
|
byte[] data = findCoverInZipByHeuristic(epubFile);
|
||||||
|
if (data != null) return data;
|
||||||
|
}
|
||||||
|
|
||||||
return (coverImage != null) ? coverImage.getData() : null;
|
return (coverImage != null) ? coverImage.getData() : null;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
|
log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
|
||||||
@@ -148,6 +165,69 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String findManifestCoverByHeuristic(File epubFile) {
|
||||||
|
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||||
|
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||||
|
dbf.setNamespaceAware(true);
|
||||||
|
dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
|
||||||
|
DocumentBuilder builder = dbf.newDocumentBuilder();
|
||||||
|
|
||||||
|
FileHeader containerHdr = zip.getFileHeader("META-INF/container.xml");
|
||||||
|
if (containerHdr == null) return null;
|
||||||
|
|
||||||
|
try (InputStream cis = zip.getInputStream(containerHdr)) {
|
||||||
|
Document containerDoc = builder.parse(cis);
|
||||||
|
NodeList roots = containerDoc.getElementsByTagName("rootfile");
|
||||||
|
if (roots.getLength() == 0) return null;
|
||||||
|
|
||||||
|
String opfPath = ((Element) roots.item(0)).getAttribute("full-path");
|
||||||
|
if (StringUtils.isBlank(opfPath)) return null;
|
||||||
|
|
||||||
|
FileHeader opfHdr = zip.getFileHeader(opfPath);
|
||||||
|
if (opfHdr == null) return null;
|
||||||
|
|
||||||
|
try (InputStream in = zip.getInputStream(opfHdr)) {
|
||||||
|
Document doc = builder.parse(in);
|
||||||
|
NodeList manifestItems = doc.getElementsByTagName("item");
|
||||||
|
|
||||||
|
for (int i = 0; i < manifestItems.getLength(); i++) {
|
||||||
|
Element item = (Element) manifestItems.item(i);
|
||||||
|
String id = item.getAttribute("id");
|
||||||
|
String href = item.getAttribute("href");
|
||||||
|
String mediaType = item.getAttribute("media-type");
|
||||||
|
|
||||||
|
if ((id != null && id.toLowerCase().contains("cover")) || (href != null && href.toLowerCase().contains("cover"))) {
|
||||||
|
if (mediaType != null && mediaType.startsWith("image/")) {
|
||||||
|
String decodedHref = URLDecoder.decode(href, StandardCharsets.UTF_8);
|
||||||
|
return resolvePath(opfPath, decodedHref);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Heuristic manifest search failed: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] findCoverInZipByHeuristic(File epubFile) {
|
||||||
|
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||||
|
List<FileHeader> fileHeaders = zip.getFileHeaders();
|
||||||
|
for (FileHeader header : fileHeaders) {
|
||||||
|
String name = header.getFileName().toLowerCase();
|
||||||
|
if (name.contains("cover") && (name.endsWith(".jpg") || name.endsWith(".jpeg") || name.endsWith(".png") || name.endsWith(".webp"))) {
|
||||||
|
try (InputStream is = zip.getInputStream(header)) {
|
||||||
|
return is.readAllBytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Direct ZIP scan for cover failed: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BookMetadata extractMetadata(File epubFile) {
|
public BookMetadata extractMetadata(File epubFile) {
|
||||||
try (ZipFile zip = new ZipFile(epubFile)) {
|
try (ZipFile zip = new ZipFile(epubFile)) {
|
||||||
|
|||||||
@@ -465,6 +465,30 @@ class EpubMetadataExtractorTest {
|
|||||||
assertTrue(cover.length > 0);
|
assertTrue(cover.length > 0);
|
||||||
assertArrayEquals(pngImage, cover);
|
assertArrayEquals(pngImage, cover);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should extract cover using manifest heuristic fallback (href containing 'cover')")
|
||||||
|
void extractCover_manifestHeuristic_returnsCoverBytes() throws IOException {
|
||||||
|
byte[] pngImage = createMinimalPngImage();
|
||||||
|
File epubFile = createEpubWithHeuristicManifestCover(pngImage, "some-id", "some-cover-file.png");
|
||||||
|
|
||||||
|
byte[] cover = extractor.extractCover(epubFile);
|
||||||
|
|
||||||
|
assertNotNull(cover, "Cover should be extracted via manifest heuristic");
|
||||||
|
assertArrayEquals(pngImage, cover);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should extract cover using ZIP heuristic fallback (ZIP entry containing 'cover')")
|
||||||
|
void extractCover_zipHeuristic_returnsCoverBytes() throws IOException {
|
||||||
|
byte[] pngImage = createMinimalPngImage();
|
||||||
|
File epubFile = createEpubWithHeuristicZipCover(pngImage, "OEBPS/my-cool-cover.jpg");
|
||||||
|
|
||||||
|
byte[] cover = extractor.extractCover(epubFile);
|
||||||
|
|
||||||
|
assertNotNull(cover, "Cover should be extracted via ZIP heuristic");
|
||||||
|
assertArrayEquals(pngImage, cover);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nested
|
@Nested
|
||||||
@@ -870,5 +894,92 @@ class EpubMetadataExtractorTest {
|
|||||||
|
|
||||||
return epubFile;
|
return epubFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private File createEpubWithHeuristicManifestCover(byte[] coverImageData, String id, String href) throws IOException {
|
||||||
|
String opfContent = String.format("""
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
|
||||||
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<dc:title>Book with Heuristic Manifest Cover</dc:title>
|
||||||
|
</metadata>
|
||||||
|
<manifest>
|
||||||
|
<item id="%s" href="%s" media-type="image/png"/>
|
||||||
|
</manifest>
|
||||||
|
</package>
|
||||||
|
""", id, href);
|
||||||
|
|
||||||
|
File epubFile = tempDir.resolve("test-heuristic-manifest-" + System.nanoTime() + ".epub").toFile();
|
||||||
|
|
||||||
|
String containerXml = """
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||||
|
<rootfiles>
|
||||||
|
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||||
|
</rootfiles>
|
||||||
|
</container>
|
||||||
|
""";
|
||||||
|
|
||||||
|
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
|
||||||
|
zos.putNextEntry(new ZipEntry("mimetype"));
|
||||||
|
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
|
||||||
|
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
|
||||||
|
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry("OEBPS/" + href));
|
||||||
|
zos.write(coverImageData);
|
||||||
|
zos.closeEntry();
|
||||||
|
}
|
||||||
|
|
||||||
|
return epubFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
private File createEpubWithHeuristicZipCover(byte[] coverImageData, String path) throws IOException {
|
||||||
|
File epubFile = tempDir.resolve("test-heuristic-zip-" + System.nanoTime() + ".epub").toFile();
|
||||||
|
|
||||||
|
String containerXml = """
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||||
|
<rootfiles>
|
||||||
|
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||||
|
</rootfiles>
|
||||||
|
</container>
|
||||||
|
""";
|
||||||
|
|
||||||
|
String opfContent = """
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<package xmlns="http://www.idpf.org/2007/opf" version="3.0">
|
||||||
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<dc:title>Book with Heuristic ZIP Cover</dc:title>
|
||||||
|
</metadata>
|
||||||
|
</package>
|
||||||
|
""";
|
||||||
|
|
||||||
|
try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
|
||||||
|
zos.putNextEntry(new ZipEntry("mimetype"));
|
||||||
|
zos.write("application/epub+zip".getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry("META-INF/container.xml"));
|
||||||
|
zos.write(containerXml.getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry("OEBPS/content.opf"));
|
||||||
|
zos.write(opfContent.getBytes(StandardCharsets.UTF_8));
|
||||||
|
zos.closeEntry();
|
||||||
|
|
||||||
|
zos.putNextEntry(new ZipEntry(path));
|
||||||
|
zos.write(coverImageData);
|
||||||
|
zos.closeEntry();
|
||||||
|
}
|
||||||
|
|
||||||
|
return epubFile;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user