[librsvg/librsvg-2.52: 10/21] PdfPredicate.with_text() - simple way to test for a PDF containing some textual content
- From: Federico Mena Quintero <federico src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [librsvg/librsvg-2.52: 10/21] PdfPredicate.with_text() - simple way to test for a PDF containing some textual content
- Date: Fri, 11 Mar 2022 20:37:22 +0000 (UTC)
commit 44ca3822dd514b55ed9e2aa1f90915739522c56e
Author: Federico Mena Quintero <federico gnome org>
Date: Tue Mar 8 16:45:57 2022 -0600
PdfPredicate.with_text() - simple way to test for a PDF containing some textual content
This is *really* basic; lopdf's Document::extract_text() simply
concatenates all the text strings in a page and returns them, so the
predicate just does String::contains().
If we ever need something more sophisticated, we can walk the PDF
structure manually with lopdf (gulp) and extract individual commands
for text.
Part-of: <https://gitlab.gnome.org/GNOME/librsvg/-/merge_requests/673>
tests/src/predicates/pdf.rs | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
---
diff --git a/tests/src/predicates/pdf.rs b/tests/src/predicates/pdf.rs
index 862e64bb8..62a3da4bb 100644
--- a/tests/src/predicates/pdf.rs
+++ b/tests/src/predicates/pdf.rs
@@ -50,6 +50,13 @@ impl PdfPredicate {
d: Detail::Link(link.to_string()),
}
}
+
+    /// Requires the text extracted from the PDF to contain `text`.
+    pub fn with_text(self: Self, text: &str) -> DetailPredicate<Self> {
+        let d = Detail::Text(String::from(text));
+
+        DetailPredicate::<Self> { p: self, d }
+    }
}
impl Predicate<[u8]> for PdfPredicate {
@@ -86,6 +93,7 @@ enum Detail {
PageSize(Dimensions, usize),
CreationDate(DateTime<Utc>),
Link(String),
+ Text(String),
}
/// A PDF page's dimensions from its `MediaBox`.
@@ -160,6 +168,7 @@ impl DetailPredicate<PdfPredicate> {
Detail::PageSize(d, idx) => doc.get_page_size(*idx).map_or(false, |dim| dim == *d),
Detail::CreationDate(d) => doc.get_creation_date().map_or(false, |date| date == *d),
Detail::Link(link) => document_has_link(doc, &link),
+ Detail::Text(text) => document_has_text(doc, &text),
}
}
@@ -193,6 +202,9 @@ impl DetailPredicate<PdfPredicate> {
"actual link contents",
"FIXME: who knows, but it's not what we expected".to_string(),
),
+ Detail::Text(_) => {
+ Product::new("actual text contents", doc.extract_text(&[1]).unwrap())
+ }
}
}
}
@@ -290,10 +302,21 @@ impl fmt::Display for DetailPredicate<PdfPredicate> {
Detail::PageSize(d, _) => write!(f, "is a PDF sized {}", d),
Detail::CreationDate(d) => write!(f, "is a PDF created {:?}", d),
Detail::Link(l) => write!(f, "is a PDF with a link to {}", l),
+ Detail::Text(t) => write!(f, "is a PDF with \"{}\" in its text content", t),
}
}
}
+// Extremely trivial check: does `needle` occur anywhere in the text that lopdf
+// extracts from the document's first page?
+//
+// A failure to extract the text is treated as "not found" rather than an error.
+fn document_has_text(document: &lopdf::Document, needle: &str) -> bool {
+    let extracted = text_from_first_page(document);
+
+    extracted.map_or(false, |haystack| haystack.contains(needle))
+}
+
// We do a super simple test that a PDF actually contains an Annotation object
// with a particular link. We don't test that this annotation is actually linked
// from a page; that would be nicer.
@@ -327,3 +350,9 @@ fn dict_has_a_with_link(dict: &Dictionary, link_text: &str) -> bool {
.map(|string| string == link_text.as_bytes())
.unwrap_or(false)
}
+
+fn text_from_first_page(doc: &lopdf::Document) -> lopdf::Result<String> {
+    // Very simplistic: lopdf concatenates all of the page's text into one string.
+    let first_page = [1];
+    doc.extract_text(&first_page)
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]