[orca] Begin addition of support for language identification

From: Joanmarie Diggs <joanied src gnome org>
To: commits-list gnome org
Cc:
Subject: [orca] Begin addition of support for language identification
Date: Wed, 5 Jan 2022 15:29:11 +0000 (UTC)
commit c6cbefcea5cdb4eb3c12c67fa0e55c1eb238e5da
Author: Joanmarie Diggs <jdiggs igalia com>
Date:   Wed Jan 5 16:25:52 2022 +0100

    Begin addition of support for language identification
    
    Please note: This is a work in progress, and end users should not
    experience any changes in behavior. It is part of the foundation needed
    to eventually make automatic language switching an optional feature.

 src/orca/script_utilities.py             | 59 ++++++++++++++++++++++++++
 src/orca/scripts/web/script_utilities.py | 73 +++++++++++++++++++++++++++++++-
 2 files changed, 130 insertions(+), 2 deletions(-)
---
diff --git a/src/orca/script_utilities.py b/src/orca/script_utilities.py
index 290cb8874..c5015ec2a 100644
--- a/src/orca/script_utilities.py
+++ b/src/orca/script_utilities.py
@@ -3146,6 +3146,38 @@ class Utilities:
 
         return self._script.attributeNamesDict.get(attribName, attribName)
 
+    def getAllTextAttributesForObject(self, obj):
+        """Returns a list of (start, end, attrsDict) tuples for obj."""
+        try:
+            text = obj.queryText()
+        except:
+            return []
+
+        msg = "INFO: Getting all text attributes for %s" % obj
+        debug.println(debug.LEVEL_INFO, msg, True)
+
+        rv = []
+        offset = 0
+        while offset < text.characterCount:
+            attrList, start, end = text.getAttributeRun(offset)
+            if start == end:
+                msg = "INFO: start and end offsets should not be equal in attribute run"
+                debug.println(debug.LEVEL_INFO, msg, True)
+                break
+
+            if start < offset:
+                msg = "INFO: Unexpected start offset less than offset in attribute run"
+                debug.println(debug.LEVEL_INFO, msg, True)
+                break
+
+            attrDict = dict([attr.split(':', 1) for attr in attrList])
+            rv.append((start, end, attrDict))
+            offset = end
+
+        msg = "INFO: Result: %s" % rv
+        debug.println(debug.LEVEL_INFO, msg, True)
+        return rv
+
     def textAttributes(self, acc, offset=None, get_defaults=False):
         """Get the text attributes run for a given offset in a given accessible
 
@@ -3211,6 +3243,33 @@ class Utilities:
 
         return "%s: %s" % (localizedKey, localizedValue)
 
+    def getLanguageAndDialectForSubstring(self, obj, start, end):
+        """Returns a (language, dialect) tuple. If multiple languages apply to
+        the substring, language and dialect will be empty strings. Callers must
+        do any preprocessing to avoid that condition."""
+
+        allSubstrings = self.getLanguageAndDialectForObject(obj)
+        for startOffset, endOffset, language, dialect in allSubstrings:
+            if startOffset <= start and endOffset >= end:
+                return language, dialect
+
+        return "", ""
+
+    def getLanguageAndDialectForObject(self, obj):
+        """Returns a list of (start, end, language, dialect) tuples for obj.
+        This default implementation assumes there can be exactly one language
+        plus dialect that applies to the entire object. Support for apps in
+        which that assumption is not valid must override this method.
+        """
+
+        locale, encoding = obj.objectLocale.split(".")
+        if not locale:
+            locale, encoding = local.getdefaultlocale()
+
+        language, dialect = locale.split("_")
+        start, end = 0, -1
+        return [(start, end, language, dialect)]
+
     def willEchoCharacter(self, event):
         """Given a keyboard event containing an alphanumeric key,
         determine if the script is likely to echo it as a character.
diff --git a/src/orca/scripts/web/script_utilities.py b/src/orca/scripts/web/script_utilities.py
index 5509c8835..c603b1115 100644
--- a/src/orca/scripts/web/script_utilities.py
+++ b/src/orca/scripts/web/script_utilities.py
@@ -53,6 +53,8 @@ class Utilities(script_utilities.Utilities):
 
         self._objectAttributes = {}
         self._currentTextAttrs = {}
+        self._allTextAttrs = {}
+        self._languageAndDialects = {}
         self._caretContexts = {}
         self._priorContexts = {}
         self._contextPathsRolesAndNames = {}
@@ -152,6 +154,8 @@ class Utilities(script_utilities.Utilities):
     def clearCachedObjects(self):
         debug.println(debug.LEVEL_INFO, "WEB: cleaning up cached objects", True)
         self._objectAttributes = {}
+        self._allTextAttrs = {}
+        self._languageAndDialects = {}
         self._inDocumentContent = {}
         self._inTopLevelWebApp = {}
         self._isTextBlockElement = {}
@@ -932,6 +936,69 @@ class Utilities(script_utilities.Utilities):
 
         return super().localizeTextAttribute(key, value)
 
+    def getAllTextAttributesForObject(self, obj):
+        """Returns a list of (start, end, attrsDict) tuples for obj."""
+
+        if not (obj and self.inDocumentContent(obj)):
+            return super().getAllTextAttributesForObject(obj)
+
+        rv = self._allTextAttrs.get(hash(obj))
+        if rv is not None:
+            return rv
+
+        rv = super().getAllTextAttributesForObject(obj)
+        self._allTextAttrs[hash(obj)] = rv
+        return rv
+
+    def adjustContentsForLanguage(self, contents):
+        rv = []
+        for content in contents:
+            rv.extend(self.splitSubstringByLanguage(*content[0:3]))
+
+        return rv
+
+    def splitSubstringByLanguage(self, obj, start, end):
+        rv = []
+        allSubstrings = self.getLanguageAndDialectForObject(obj)
+        for startOffset, endOffset, language, dialect in allSubstrings:
+            if start > endOffset:
+                continue
+            if end <= startOffset:
+                break
+            string = self.substring(obj, startOffset, endOffset)
+            rv.append([obj, startOffset, endOffset, string])
+
+        return rv
+
+    def getLanguageAndDialectForObject(self, obj):
+        """Returns a list of (start, end, language, dialect) tuples for obj."""
+
+        if not self.inDocumentContent(obj):
+            return super().getLanguageAndDialectForObject(obj)
+
+        rv = self._languageAndDialects.get(hash(obj))
+        if rv is not None:
+            return rv
+
+        rv = []
+        attributeSet = self.getAllTextAttributesForObject(obj)
+        for (start, end, attrs) in attributeSet:
+            language = attrs.get("language", "")
+            dialect = ""
+            if "-" in language:
+                language, dialect = language.split("-")
+            rv.append((start, end, language, dialect))
+
+        # Embedded objects such as images and certain widgets won't implement the text interface
+        # and thus won't expose text attributes. Therefore try to get the info from the parent.
+        if not attributeSet:
+            start, end = self.getHyperlinkRange(obj)
+            language, dialect = self.getLanguageAndDialectForSubstring(obj.parent, start, end)
+            rv.append((0, 1, language, dialect))
+
+        self._languageAndDialects[hash(obj)] = rv
+        return rv
+
     def findObjectInContents(self, obj, offset, contents, usingCache=False):
         if not obj or not contents:
             return -1
@@ -1442,7 +1509,7 @@ class Utilities(script_utilities.Utilities):
             string = string[rangeStart:rangeEnd]
             end = start + len(string)
 
-        return [[obj, start, end, string]]
+        return self.adjustContentsForLanguage([[obj, start, end, string]])
 
     def getSentenceContentsAtOffset(self, obj, offset, useCache=True):
         if not obj:
@@ -1690,7 +1757,9 @@ class Utilities(script_utilities.Utilities):
                 extents = self.getExtents(acc, start, end)
             except:
                 extents = "(exception)"
-            msg = "     %i. chars: %i-%i: '%s' extents=%s\n" % (i, start, end, string, extents)
+            language, dialect = self.getLanguageAndDialectForSubstring(acc, start, end)
+            msg = "     %i. chars: %i-%i: '%s' extents=%s language='%s' dialect='%s'\n" % \
+                (i, start, end, string, extents, language, dialect)
             msg += debug.getAccessibleDetails(debug.LEVEL_INFO, acc, indent)
             debug.println(debug.LEVEL_INFO, msg, True)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]