from lxml import etree, html from io import BytesIO def convertHTML(source: str, sourceFrom: str): htmlParser = html.HTMLParser(remove_comments=True, remove_blank_text=True) xmlParser = etree.XMLParser(remove_comments=True, remove_blank_text=True) if sourceFrom == "xml": xmldoc = etree.parse(BytesIO(source.encode("utf-8")), xmlParser) return html.tostring(xmldoc, method="html", pretty_print=True, doctype="").decode() elif sourceFrom == "html": htmldoc = html.parse(BytesIO(source.encode("utf-8")), htmlParser) return etree.tostring(htmldoc, method="xml", pretty_print=True, doctype="", xml_declaration=True, encoding="utf-8").decode() else: return def formatHTML(source: str, prettify: bool) -> str: parser = html.HTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True) htmlDoc = html.parse(BytesIO(source.encode("utf-8")),parser=parser) if not prettify: return html.tostring(htmlDoc).decode().replace("\n", "").replace("> ", ">") return etree.tostring(htmlDoc, encoding='unicode', pretty_print=True) def formatXML(source: str, prettify: bool) -> str: """Method used to format XML :param source: XML to format :param prettify: sets if XML must be prettified (added indentations etc.) or not :return: formatted XML """ # Prolog is removed when XML is parsed # so program has to copy it prolog = "" prolog_start = source.find("") + 2 prolog = source[prolog_start:prolog_end] source = source[prolog_end: ] byte_input = BytesIO(source.encode("utf-8")) parser = etree.XMLParser(remove_blank_text=True) xml = etree.parse(byte_input, parser=parser) if prettify: prolog += "\n" return prolog + etree.tostring(xml, pretty_print=prettify).decode() def xpath(source: str, xpath: str) -> str: """ Method used to get nodes from XML string using XPath :param source: XML string used for selection :param xpath: XPath query used for selection :return: Nodes selected using XPath """ byte_input = BytesIO(source.encode("utf-8")) root = etree.parse(byte_input).getroot() nsmap = root.nsmap # LXML doesn't accept empty (None) namespace prefix, # so it need to be deleted if exists if None in nsmap: nsmap.pop(None) result = root.xpath(xpath, namespaces=nsmap) # root.xpath can return 4 types: float, string, bool and list. # List is the only one that can't be simply converted to str if type(result) is not list: return str(result), type(result).__name__ else: result_string = "" for e in result: result_string += etree.tostring(e, pretty_print=True).decode() + "\n" return result_string, "node" def xsd(source: str, xsd: str) -> bool: """ Method used to validate XML string against XSD schema :param source: XML string used for validation :param xsd: XSD schema to validate XML against :return: Message saying, if the validation was successful or not """ schema_input = BytesIO(xsd.encode("utf-8")) xml_schema = etree.XMLSchema(etree.parse(schema_input).getroot()) document_input = BytesIO(source.encode("utf-8")) xml = etree.parse(document_input).getroot() try: xml_schema.assertValid(xml) return "XML is valid" except etree.DocumentInvalid as e: return str(e) def xslt(source: str, xslt: str) -> str: """ Method used to transform XML string using XSLT :param source: XML string to transform :param xslt: XSLT string used to transform XML :return: Result of transformation """ xslt_input = BytesIO(xslt.encode("utf-8")) xslt_transform = etree.XSLT(etree.parse(xslt_input)) document_input = BytesIO(source.encode("utf-8")) xml = etree.parse(document_input).getroot() transformed = str(xslt_transform(xml)) return formatXML(transformed, True)