add tools from distri, assign data sources

2016-06-08 15:03:13 +02:00
parent 7586087ecc
commit 607f7af7f8
34 changed files with 32174 additions and 12 deletions
--- a/tools/pyUANamespace/open62541_XMLPreprocessor.py
+++ b/tools/pyUANamespace/open62541_XMLPreprocessor.py
@ -0,0 +1,390 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###
+### Author:  Chris Iatrou (ichrispa@core-vector.net)
+###
+### This program was created for educational purposes and has been
+### contributed to the open62541 project by the author. All licensing
+### terms for this source is inherited by the terms and conditions
+### specified for by the open62541 project (see the projects readme
+### file for more information on the LGPL terms and restrictions).
+###
+### This program is not meant to be used in a production environment. The
+### author is not liable for any complications arising due to the use of
+### this program.
+###
+
+from logger import *
+from ua_constants import *
+import tempfile
+import xml.dom.minidom as dom
+import os
+import string
+from collections import Counter
+
+from ua_namespace import opcua_node_id_t
+
+class preProcessDocument:
+  originXML = '' # Original XML passed to the preprocessor
+  targetXML = () # tuple of (fileHandle, fileName)
+  nodeset   = '' # Parsed DOM XML object
+  parseOK   = False;
+  containedNodes  = [] # contains tuples of (opcua_node_id_t, xmlelement)
+  referencedNodes = [] # contains tuples of (opcua_node_id_t, xmlelement)
+  namespaceOrder  = [] # contains xmlns:sX attributed as tuples (int ns, string name)
+  namespaceQualifiers = []      # contains all xmlns:XYZ qualifiers that might prefix value aliases (like "<uax:Int32>")
+  referencedNamesSpaceUris = [] # contains <NamespaceUris> URI elements
+  
+  def __init__(self, originXML):
+    self.originXML = originXML
+    self.targetXML = tempfile.mkstemp(prefix=os.path.basename(originXML)+"_preProcessed-" ,suffix=".xml")
+    self.parseOK   = True
+    self.containedNodes  = []
+    self.referencedNodes = []
+    self.namespaceOrder  = []
+    self.referencedNamesSpaceUris = []
+    self.namespaceQualifiers = []
+    try:
+      self.nodeset = dom.parse(originXML)
+      if len(self.nodeset.getElementsByTagName("UANodeSet")) == 0 or len(self.nodeset.getElementsByTagName("UANodeSet")) > 1:
+        log(self, "Document " + self.targetXML[1] + " contains no or more then 1 nodeset", LOG_LEVEL_ERROR)
+        self.parseOK   = False
+    except:
+      self.parseOK   = False
+    log(self, "Adding new document to be preprocessed " + os.path.basename(originXML) + " as " + self.targetXML[1], LOG_LEVEL_DEBUG)
+  
+  def clean(self):
+    #os.close(self.targetXML[0]) Don't -> done to flush() after finalize()
+    os.remove(self.targetXML[1])
+  
+  def getTargetXMLName(self):
+    if (self.parseOK):
+      return self.targetXML[1]
+    return None
+  
+  def extractNamespaceURIs(self):
+    """ extractNamespaceURIs
+        
+        minidom gobbles up <NamespaceUris></NamespaceUris> elements, without a decent
+        way to reliably access this dom2 <uri></uri> elements (only attribute xmlns= are 
+        accessible using minidom).  We need them for dereferencing though... This 
+        function attempts to do just that.
+        
+        returns: Nothing
+    """
+    infile = open(self.originXML)
+    foundURIs = False
+    nsline = ""
+    line = infile.readline()
+    for line in infile:
+      if "<namespaceuris>" in line.lower():
+        foundURIs = True
+      elif "</namespaceuris>" in line.lower():
+        foundURIs = False
+        nsline = nsline + line
+        break
+      if foundURIs:
+        nsline = nsline + line
+    
+    if len(nsline) > 0:
+      ns = dom.parseString(nsline).getElementsByTagName("NamespaceUris")
+      for uri in ns[0].childNodes:
+        if uri.nodeType != uri.ELEMENT_NODE:
+          continue
+        self.referencedNamesSpaceUris.append(uri.firstChild.data)
+      
+    infile.close()
+    
+  def analyze(self):
+    """ analyze()
+    
+        analyze will gather information about the nodes and references contained in a XML File
+        to facilitate later preprocessing stages that adresss XML dependency issues
+        
+        returns: No return value
+    """ 
+    nodeIds = []
+    ns = self.nodeset.getElementsByTagName("UANodeSet")
+    
+    # We need to find out what the namespace calls itself and other referenced, as numeric id's are pretty
+    # useless sans linked nodes. There is two information sources...
+    self.extractNamespaceURIs() # From <URI>...</URI> definitions
+    
+    for key in ns[0].attributes.keys(): # from xmlns:sX attributes
+      if "xmlns:" in key:  # Any key: we will be removing these qualifiers from Values later
+        self.namespaceQualifiers.append(key.replace("xmlns:",""))
+      if "xmlns:s" in key: # get a numeric nsId and modelname/uri
+        self.namespaceOrder.append((int(key.replace("xmlns:s","")), ns[0].getAttribute(key)))
+    
+    # Get all nodeIds contained in this XML
+    for nd in ns[0].childNodes:
+      if nd.nodeType != nd.ELEMENT_NODE:
+        continue
+      if nd.hasAttribute(u'NodeId'):
+        self.containedNodes.append( (opcua_node_id_t(nd.getAttribute(u'NodeId')), nd) )
+        refs = nd.getElementsByTagName(u'References')[0]
+        for ref in refs.childNodes:
+          if ref.nodeType == ref.ELEMENT_NODE:
+            self.referencedNodes.append( (opcua_node_id_t(ref.firstChild.data), ref) )
+    
+    log(self, "Nodes: " + str(len(self.containedNodes)) + " References: " + str(len(self.referencedNodes)), LOG_LEVEL_DEBUG)
+  
+  def getNamespaceId(self):
+    """ namespaceId()
+        
+        Counts the namespace IDs in all nodes of this XML and picks the most used
+        namespace as the numeric identifier of this data model.
+        
+        returns: Integer ID of the most propable/most used namespace in this XML
+    """
+    max = 0;
+    namespaceIdGuessed = 0;
+    idDict = {}
+    
+    for ndid in self.containedNodes:
+      if not idDict.has_key(ndid[0].ns):
+        idDict[ndid[0].ns] = 1
+      else:
+        idDict[ndid[0].ns] = idDict[ndid[0].ns] + 1
+    
+    for entry in idDict:
+      if idDict[entry] > max:
+        max = idDict[entry]
+        namespaceIdGuessed = entry
+    log(self, "XML Contents are propably in namespace " + str(entry) + " (used by " + str(idDict[entry]) + " Nodes)", LOG_LEVEL_DEBUG)
+    return namespaceIdGuessed
+  
+  def getReferencedNamespaceUri(self, nsId):
+    """ getReferencedNamespaceUri
+    
+        returns an URL that hopefully corresponds to the nsId that was used to reference this model
+        
+        return: URI string corresponding to nsId
+    """
+    # Might be the more reliable method: Get the URI from the xmlns attributes (they have numers)
+    if len(self.namespaceOrder) > 0:
+      for el in self.namespaceOrder:
+        if el[0] == nsId:
+          return el[1]
+    
+    # Fallback: 
+    #  Some models do not have xmlns:sX attributes, but still <URI>s (usually when they only reference NS0)
+    if len(self.referencedNamesSpaceUris) > 0  and len(self.referencedNamesSpaceUris) >= nsId-1:
+      return self.referencedNamesSpaceUris[nsId-1]
+    
+    #Nope, not found.
+    return ""
+  
+  def getNamespaceDependencies(self):
+    deps = []
+    for ndid in self.referencedNodes:
+      if not ndid[0].ns in deps:
+        deps.append(ndid[0].ns)
+    return deps
+    
+  def finalize(self):
+    outfile = self.targetXML[0]
+    outline = self.nodeset.toxml()
+    for qualifier in self.namespaceQualifiers:
+      rq = qualifier+":"
+      outline = outline.replace(rq.decode('UTF-8'), "")
+    os.write(outfile, outline.encode('UTF-8'))
+    os.close(outfile)
+    
+  def reassignReferencedNamespaceId(self, currentNsId, newNsId):
+    """ reassignReferencedNamespaceId
+        
+        Iterates over all references in this document, find references to currentNsId and changes them to newNsId.
+        NodeIds themselves are not altered.
+        
+        returns: nothing
+    """ 
+    for refNd in self.referencedNodes:
+      if refNd[0].ns == currentNsId:
+        refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
+        refNd[0].ns = newNsId
+        refNd[0].toString()
+  
+  def reassignNamespaceId(self, currentNsId, newNsId):
+    """ reassignNamespaceId
+        
+        Iterates over all nodes in this document, find those in namespace currentNsId and changes them to newNsId.
+        
+        returns: nothing
+    """ 
+    log(self, "Migrating nodes /w ns index " + str(currentNsId) + " to " + str(newNsId), LOG_LEVEL_DEBUG)
+    for nd in self.containedNodes:
+      if nd[0].ns == currentNsId:
+        # In our own document, update any references to this node
+        for refNd in self.referencedNodes:
+          if refNd[0].ns == currentNsId and refNd[0] == nd[0]:
+            refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
+            refNd[0].ns = newNsId
+            refNd[0].toString()
+        nd[1].setAttribute(u'NodeId', nd[1].getAttribute(u'NodeId').replace("ns="+str(currentNsId), "ns="+str(newNsId)))
+        nd[0].ns = newNsId
+        nd[0].toString()
+  
+class open62541_XMLPreprocessor:
+  preProcDocuments = []
+  
+  def __init__(self):
+      self.preProcDocuments = []
+      
+  def addDocument(self, documentPath):
+    self.preProcDocuments.append(preProcessDocument(documentPath))
+    
+  def removePreprocessedFiles(self):
+    for doc in self.preProcDocuments:
+      doc.clean()
+  
+  def getPreProcessedFiles(self):
+    files = []
+    for doc in self.preProcDocuments:
+      if (doc.parseOK):
+        files.append(doc.getTargetXMLName())
+    return files
+  
+  def testModelCongruencyAgainstReferences(self, doc, refs):
+    """ testModelCongruencyAgainstReferences
+    
+        Counts how many of the nodes referencef in refs can be found in the model
+        doc.
+        
+        returns: double corresponding to the percentage of hits
+    """
+    sspace = len(refs)
+    if sspace == 0:
+      return float(0)
+    found   = 0
+    for ref in refs:
+      for n in doc.containedNodes:
+        if str(ref) == str(n[0]):
+          print ref, n[0]
+          found = found + 1
+          break
+    return float(found)/float(sspace)
+    
+  def preprocess_assignUniqueNsIds(self):
+    nsdep  = []
+    docLst = []
+    # Search for namespace 0('s) - plural possible if user is overwriting NS0 defaults
+    # Remove them from the list of namespaces, zero does not get demangled
+    for doc in self.preProcDocuments:
+      if doc.getNamespaceId() == 0:
+        docLst.append(doc)
+    for doc in docLst:
+      self.preProcDocuments.remove(doc)
+    
+    # Reassign namespace id's to be in ascending order
+    nsidx = 1 # next namespace id to assign on collision (first one will be "2")
+    for doc in self.preProcDocuments:
+      nsidx = nsidx + 1
+      nsid = doc.getNamespaceId()
+      doc.reassignNamespaceId(nsid, nsidx)
+      docLst.append(doc)
+      log(self, "Document " + doc.originXML + " is now namespace " + str(nsidx), LOG_LEVEL_INFO)
+    self.preProcDocuments = docLst
+  
+  def getUsedNamespaceArrayNames(self):
+    """ getUsedNamespaceArrayNames
+    
+        Returns the XML xmlns:s1 or <URI>[0] of each XML document (if contained/possible)
+        
+        returns: dict of int:nsId -> string:url
+    """
+    nsName = {}
+    for doc in self.preProcDocuments:
+      uri = doc.getReferencedNamespaceUri(1)
+      if uri == None:
+        uri = "http://modeluri.not/retrievable/from/xml"
+      nsName[doc.getNamespaceId()] = doc.getReferencedNamespaceUri(1)
+    return nsName
+      
+  def preprocess_linkDependantModels(self):    
+    revertToStochastic = [] # (doc, int id), where id was not resolvable using model URIs
+    
+    # Attemp to identify the model relations by using model URIs in xmlns:sX or <URI> contents
+    for doc in self.preProcDocuments:
+      nsid = doc.getNamespaceId()
+      dependencies = doc.getNamespaceDependencies()
+      for d in dependencies:
+        if d != nsid and d != 0:
+          # Attempt to identify the namespace URI this d referes to...
+          nsUri = doc.getReferencedNamespaceUri(d) # FIXME: This could actually fail and return ""!
+          log(self, "Need a namespace referenced as " + str(d) + ". Which hopefully is " + nsUri, LOG_LEVEL_INFO)
+          targetDoc = None
+          for tgt in self.preProcDocuments:
+            # That model, whose URI is known but its current id is not, will 
+            #   refer have referred to itself as "1"
+            if tgt.getReferencedNamespaceUri(1) == nsUri:
+              targetDoc = tgt
+              break
+          if not targetDoc == None:
+            # Found the model... relink the references
+            doc.reassignReferencedNamespaceId(d, targetDoc.getNamespaceId())
+            continue
+          else:
+            revertToStochastic.append((doc, d)) 
+            log(self, "Failed to reliably identify which XML/Model " + os.path.basename(doc.originXML) + " calls ns=" +str(d), LOG_LEVEL_WARN)
+    
+    for (doc, d) in revertToStochastic:
+      log(self, "Attempting to find stochastic match for target namespace ns=" + str(d) + " of " + os.path.basename(doc.originXML), LOG_LEVEL_WARN)
+      # Copy all references to the given namespace
+      refs = []
+      matches = [] # list of (match%, targetDoc) to pick from later
+      for ref in doc.referencedNodes:
+        if ref[0].ns == d:
+          refs.append(opcua_node_id_t(str(ref[0])))
+      for tDoc in self.preProcDocuments:
+        tDocId = tDoc.getNamespaceId()
+        # Scenario: If these references did target this documents namespace...
+        for r in refs:
+          r.ns = tDocId
+          r.toString()
+        # ... how many of them would be found!?
+        c = self.testModelCongruencyAgainstReferences(tDoc, refs)
+        print c
+        if c>0:
+          matches.append(c, tDoc)
+      best = (0, None)
+      for m in matches:
+        print m[0]
+        if m[0] > best[0]:
+          best = m
+      if best[1] != None:
+        log(self, "Best match (" + str(best[1]*100) + "%) for what " + os.path.basename(doc.originXML) + " refers to as ns="+str(d)+" was " + os.path.basename(best[1].originXML), LOG_LEVEL_WARN)
+        doc.reassignReferencedNamespaceId(d, best[1].getNamespaceId())
+      else: 
+        log(self, "Failed to find a match for what " +  os.path.basename(doc.originXML) + " refers to as ns=" + str(d) ,LOG_LEVEL_ERROR )
+      
+  def preprocessAll(self):
+    ##
+    ## First: Gather statistics about the namespaces:
+    for doc in self.preProcDocuments:
+      doc.analyze()
+    
+    # Preprocess step: Remove XML specific Naming scheme ("uax:")
+    # FIXME: Not implemented
+    
+    ##
+    ## Preprocess step: Check namespace ID multiplicity and reassign IDs if necessary
+    ##
+    self.preprocess_assignUniqueNsIds()
+    self.preprocess_linkDependantModels()
+    
+    
+    ##  
+    ## Prep step: prevent any XML from using namespace 1 (reserved for instances)
+    ## FIXME: Not implemented
+    
+    ##
+    ## Final: Write modified XML tmp files
+    for doc in self.preProcDocuments:
+      doc.finalize()
+    
+    return True
+      
+  
+