[librsvg: 3/14] PdfPredicate.with_text() - simple way to test for a PDF containing some textual content




commit 8dc81944708e92aff573f9badb708d7de421b721
Author: Federico Mena Quintero <federico gnome org>
Date:   Tue Mar 8 16:45:57 2022 -0600

    PdfPredicate.with_text() - simple way to test for a PDF containing some textual content
    
    This is *really* basic; lopdf's Document::extract_text() simply
    concatenates all the text strings in a page and returns them, so the
    predicate just does String::contains().
    
    If we ever need something more sophisticated, we can walk the PDF
    structure manually with lopdf (gulp) and extract individual commands
    for text.
    
    Part-of: <https://gitlab.gnome.org/GNOME/librsvg/-/merge_requests/673>

 tests/src/predicates/pdf.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
---
diff --git a/tests/src/predicates/pdf.rs b/tests/src/predicates/pdf.rs
index 862e64bb8..62a3da4bb 100644
--- a/tests/src/predicates/pdf.rs
+++ b/tests/src/predicates/pdf.rs
@@ -50,6 +50,13 @@ impl PdfPredicate {
             d: Detail::Link(link.to_string()),
         }
     }
+
+    /// Builds a predicate that additionally requires `text` to occur in the
+    /// PDF's extracted text content.
+    pub fn with_text(self: Self, text: &str) -> DetailPredicate<Self> {
+        let d = Detail::Text(text.to_owned());
+        DetailPredicate::<Self> { p: self, d }
+    }
 }
 
 impl Predicate<[u8]> for PdfPredicate {
@@ -86,6 +93,7 @@ enum Detail {
     PageSize(Dimensions, usize),
     CreationDate(DateTime<Utc>),
     Link(String),
+    Text(String),
 }
 
 /// A PDF page's dimensions from its `MediaBox`.
@@ -160,6 +168,7 @@ impl DetailPredicate<PdfPredicate> {
             Detail::PageSize(d, idx) => doc.get_page_size(*idx).map_or(false, |dim| dim == *d),
             Detail::CreationDate(d) => doc.get_creation_date().map_or(false, |date| date == *d),
             Detail::Link(link) => document_has_link(doc, &link),
+            Detail::Text(text) => document_has_text(doc, &text),
         }
     }
 
@@ -193,6 +202,9 @@ impl DetailPredicate<PdfPredicate> {
                 "actual link contents",
                 "FIXME: who knows, but it's not what we expected".to_string(),
             ),
+            Detail::Text(_) => {
+                Product::new("actual text contents", doc.extract_text(&[1]).unwrap())
+            }
         }
     }
 }
@@ -290,10 +302,21 @@ impl fmt::Display for DetailPredicate<PdfPredicate> {
             Detail::PageSize(d, _) => write!(f, "is a PDF sized {}", d),
             Detail::CreationDate(d) => write!(f, "is a PDF created {:?}", d),
             Detail::Link(l) => write!(f, "is a PDF with a link to {}", l),
+            Detail::Text(t) => write!(f, "is a PDF with \"{}\" in its text content", t),
         }
     }
 }
 
+// Deliberately trivial substring test: `needle` must occur in the text taken
+// from the document's first page.  A text-extraction error counts as "absent".
+fn document_has_text(document: &lopdf::Document, needle: &str) -> bool {
+    let extracted = text_from_first_page(document);
+    match extracted {
+        Ok(haystack) => haystack.contains(needle),
+        Err(_) => false,
+    }
+}
+
 // We do a super simple test that a PDF actually contains an Annotation object
 // with a particular link.  We don't test that this annotation is actually linked
 // from a page; that would be nicer.
@@ -327,3 +350,9 @@ fn dict_has_a_with_link(dict: &Dictionary, link_text: &str) -> bool {
         .map(|string| string == link_text.as_bytes())
         .unwrap_or(false)
 }
+
+fn text_from_first_page(doc: &lopdf::Document) -> lopdf::Result<String> {
+    // Very simplistic: lopdf hands back the page's text as one concatenated string.
+    let first_page = [1];
+    doc.extract_text(&first_page)
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]