3.4. Serialization XML, XSLT, XPath

3.4.1. xml module from standard library

xml module from standard library:

from xml.etree.ElementTree import parse


FILE = r'_temporary.xml'
# <execute>
#     <command timeout="2">/bin/ls -la /etc/</command>
#     <command>/bin/ls -l /home/ /tmp/</command>
#     <command timeout="1">/bin/sleep 2</command>
#     <command timeout="2">/bin/echo 'hello'</command>
# </execute>

root = parse(FILE).getroot()

for command in root.findall('./command'):
    print(command.tag)
    print(command.text)
    print(command.attrib)
    print()

# command
# /bin/ls -la /etc/
# {'timeout': '2'}
#
# command
# /bin/ls -l /home/ /tmp/
# {}
#
# command
# /bin/sleep 2
# {'timeout': '1'}
#
# command
# /bin/echo 'hello'
# {'timeout': '2'}

3.4.2. lxml module

  • pip install lxml

3.4.3. Creating elements

Creating elements:

from lxml.etree import tostring, Element


root = Element("iris")

print(tostring(root))
# b'<iris/>'

Adding elements using list interface:

from lxml.etree import tostring, Element


root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))

print(tostring(root))
# b'<iris><setosa/><versicolor/><virginica/></iris>'

3.4.4. Length of a subtree

Length of a subtree:

from lxml.etree import Element


root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))

print(len(root))
# 3

3.4.5. Selecting subtree

Selecting subtree:

from lxml.etree import Element


root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))

selected = root[2]
print(selected.tag)
# virginica

Where is selected element:

from lxml.etree import Element


root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))

selected = root[1]
root.index(selected)
# 1

selected = root[2]
root.index(selected)
# 2

3.4.6. Element tree as a lists

Elements are lists:

from lxml.etree import tostring, Element


root = Element('iris")
root.append(Element('setosa"))
root.append(Element('versicolor"))
root.append(Element('virginica"))

children = list(root)
print(children)
# [
#     <Element setosa at 0x113cd4048>,
#     <Element versicolor at 0x113cd4188>,
#     <Element virginica at 0x113cd41c8>
# ]

Iterating over elements:

from lxml.etree import Element


root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))

for child in root:
    print(child.tag)

# setosa
# versicolor
# virginica

Slicing elements:

from lxml.etree import Element


root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))

root.insert(0, Element("arctica"))

start = root[:1]
end = root[-1:]

print(start[0].tag)  # arctica
print(end[0].tag)    # virginica

3.4.7. Elements as a dict

Create element using dict interface:

from lxml.etree import tostring, Element


tag = Element("iris", kingdom="plantae")

print(tostring(tag))
# b'<iris kingdom="plantae"/>'

Get element attributes and values:

from lxml.etree import tostring, Element


tag = Element("iris", kingdom="plantae")

print(tag.get("kingdom"))          # plantae
print(tag.get("not-existing"))     # None

Set element attributes and values:

from lxml.etree import tostring, Element


tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")

print(tag.get("kind"))
# flower

print(tostring(tag))
# b'<iris kingdom="plantae" kind="flower"/>'

Elements carry attributes as a dict:

from lxml.etree import Element


tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")

tag.keys()
# ['kind', 'kingdom']

tag.values()
# ['plantae', 'flower']

tag.items()
# [('kingdom', 'plantae'), ('kind', 'flower')]

Iterating over element attributes and values:

from lxml.etree import Element


tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")

for key, value in tag.items():
    print(f'{key} -> {value}')

# kingdom -> plantae
# kind -> flower

Elements carry attributes as a dict:

from lxml.etree import Element


tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")

tag.attrib['kingdom']
# 'plantae'

tag.attrib['not-existing']
# Traceback (most recent call last):
# KeyError: 'not-existing'

tag.attrib['species'] = 'Setosa'
tag.attrib.get('species')
# 'Setosa'

tag.attrib
# {'kingdom': 'plantae', 'kind': 'flower'}

tag.attrib.items()
# [('kingdom', 'plantae'), ('kind', 'flower'), ('species', 'Setosa')]

3.4.8. Elements contain text

from lxml.etree import tostring, Element

tag = Element("iris")
tag.text = "Setosa"

tag.text
# 'Setosa'

tostring(tag)
# b'<iris>Setosa</iris>'

3.4.9. Tree iteration

from lxml.etree import tostring, Element, SubElement

root = Element("iris")
SubElement(root, "species").text = "Setosa"
SubElement(root, "species").text = "Virginica"
SubElement(root, "flower").text = "Versicolor"

print(tostring(root, pretty_print=True))
# b'<iris>
#       <species>Setosa</species>
#       <species>Virginica</species>
#       <flower>Versicolor</flower>
# </iris>'


for element in root.iter():
    print(f'{element.tag} -> {element.text}')

# iris -> None
# species -> Setosa
# species -> Virginica
# flower -> Versicolor


for element in root.iter("species"):
    print(f'{element.tag} -> {element.text}')

# species -> Setosa
# species -> Virginica


for element in root.iter("species", "flower"):
    print(f'{element.tag} -> {element.text}')

# species -> Setosa
# species -> Virginica
# flower -> Versicolor

3.4.10. Entities

from lxml.etree import tostring, Element, SubElement, Entity

root = Element("iris")
print(tostring(root))
# b'<iris/>'

root.append(Entity("#234"))
print(tostring(root))
# b'<iris>&#234;</iris>'

3.4.11. Comments

from lxml.etree import tostring, Element, SubElement, Comment

root = Element("iris")
print(tostring(root))
# b'<iris/>'

root.append(Comment("Hello World"))
print(tostring(root))
# b'<iris><!--Hello World--></iris>'
from lxml.etree import tostring, Element, SubElement

root = Element('iris')
SubElement(root, 'species').text = 'setosa'
SubElement(root, 'species').text = 'virginica'
SubElement(root, 'flower').text = 'versicolor'

print(tostring(root))
# b'<iris><species>setosa</species><species>virginica</species><flower>versicolor</flower></iris>'
from lxml.etree import tostring, Element, Entity

root = Element('iris')
root.append(Entity('#234'))

print(tostring(root))
# b'<iris>&#234;</iris>'
from lxml.etree import tostring, Element, Comment

root = Element('iris')
root.append(Comment('Hello World'))
print(tostring(root))
# b'<iris><!--Hello World--></iris>'
from lxml.etree import tostring, Element, Entity, Comment

root = Element('iris')
root.append(Element('species'))
root.append(Element('species'))
root.append(Element('flower'))
root.append(Entity('#234'))
root.append(Comment('Hello World'))

print(tostring(root))
# b'<iris><species/><species/><flower/>&#234;<!--Hello World--></iris>'


for element in root.iter():
    if isinstance(element.tag, str):
        print(f'Tag: {element.tag} -> {element.text}')
    else:
        print(f'Special: {element} -> {element.text}')

# Tag: iris -> None
# Tag: species -> None
# Tag: species -> None
# Tag: flower -> None
# Special: &#234; -> &#234;
# Special: <!--Hello World--> -> Hello World


for element in root.iter(tag=Element):
        print(f'{element.tag} -> {element.text}')

# iris -> None
# species -> None
# species -> None
# flower -> None


for element in root.iter(tag=Entity):
    print(element.text)

# &#234;


for element in root.iter(tag=Comment):
    print(element.text)

# Hello World

3.4.12. Serialization

from lxml.etree import tostring, XML


root = XML('<root><a><b/></a></root>')

tostring(root)
# b'<root><a><b/></a></root>'

print(tostring(root, xml_declaration=True))
# b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"

print(tostring(root, encoding='utf-8'))
# b'<root><a><b/></a></root>'

print(tostring(root, encoding='iso-8859-2'))
# b"<?xml version='1.0' encoding='iso-8859-2'?>\n<root><a><b/></a></root>"

print(tostring(root, pretty_print=True))
# b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'

print(tostring(root, pretty_print=True).decode())
# <root>
#   <a>
#     <b/>
#   </a>
# </root>
from lxml.etree import tostring, XML

root = XML('<html><head/><body><p>Hello<br/>World</p></body></html>')

# default: method = 'xml'
tostring(root)
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'

tostring(root, method='xml')
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'

tostring(root, method='html')
# b'<html><head></head><body><p>Hello<br>World</p></body></html>'

print(tostring(root, method='html', pretty_print=True))
# b'<html>\n<head></head>\n<body><p>Hello<br>World</p></body>\n</html>\n'

print(tostring(root, method='html', pretty_print=True).decode())
# <html>
# <head></head>
# <body><p>Hello<br>World</p></body>
# </html>

tostring(root, method='text')
# b'HelloWorld'

3.4.13. Working with HTML

  • Using lxml module

<html><body>Iris<br/>Setosa</body></html>
from lxml.etree import tostring, Element, SubElement


html = Element("html")
body = SubElement(html, "body")

body.text = "Iris"
tostring(html)
# b'<html><body>Iris</body></html>'

br = SubElement(body, "br")
tostring(html)
# b'<html><body>Iris<br/></body></html>'

br.tail = 'Setosa'
tostring(html)
# b'<html><body>Iris<br/>Setosa</body></html>'

3.4.14. XPATH

  • Using lxml module

print(html.xpath("string()")) # lxml.etree only!
# IrisSetosa

print(html.xpath("//text()")) # lxml.etree only!
# ['Iris', 'Setosa']

3.4.15. XSLT

  • Using lxml module

3.4.16. Example 1

from io import StringIO
from lxml.etree import XML, XSLT, parse


TEMPLATE = """
    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
        <xsl:template match="/">

            <my_tag>
                <xsl:value-of select="/outer/inner/text()" />
            </my_tag>

        </xsl:template>
    </xsl:stylesheet>
"""

DATA = """
    <outer>
        <inner>Hello World</inner>
    </outer>
"""

transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)

print(result)
# <?xml version="1.0"?>
# <my_tag>Hello World</my_tag>

3.4.17. Example 2

from io import StringIO
from lxml.etree import XML, XSLT, parse


DATA = """
    <astronauts>
        <astro>
            <firstname>Jan</firstname>
            <lastname>Twardowski</lastname>
        </astro>
        <astro>
            <firstname>Mark</firstname>
            <lastname>Watney</lastname>
        </astro>
    </astronauts>
"""

TEMPLATE = """
    <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
        <table>
            <thead>
                <tr>
                    <th>First Name</th>
                    <th>Last Name</th>
                </tr>
            </thead>
            <tbody>

                <xsl:for-each select="astronauts/astro">
                    <tr>
                        <td><xsl:value-of select="firstname"/></td>
                        <td><xsl:value-of select="lastname"/></td>
                    </tr>
                </xsl:for-each>

            </tbody>
        </table>
    </html>
"""

transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)

print(result)
# <html><table>
# <thead><tr>
# <th>First Name</th>
# <th>Last Name</th>
# </tr></thead>
# <tbody>
# <tr>
# <td>Jan</td>
# <td>Twardowski</td>
# </tr>
# <tr>
# <td>Mark</td>
# <td>Watney</td>
# </tr>
# </tbody>
# </table></html>

3.4.18. Example 3

from io import StringIO
from lxml.etree import XML, XSLT, parse


DATA = """
    <CATALOG>
        <PLANT>
            <COMMON>Bloodroot</COMMON>
            <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
            <ZONE>4</ZONE>
            <LIGHT>Mostly Shady</LIGHT>
            <PRICE>$2.44</PRICE>
            <AVAILABILITY>031599</AVAILABILITY>
        </PLANT>
        <PLANT>
            <COMMON>Columbine</COMMON>
            <BOTANICAL>Aquilegia canadensis</BOTANICAL>
            <ZONE>3</ZONE>
            <LIGHT>Mostly Shady</LIGHT>
            <PRICE>$9.37</PRICE>
            <AVAILABILITY>030699</AVAILABILITY>
        </PLANT>
    </CATALOG>
"""

TEMPLATE = """
    <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

    <style>
        body {font-family: Arial; font-size: 1em; background-color: #EEEEEE}
        div.title {background-color: teal; color: white; padding: 4px}
        div.description {margin-left:20px;margin-bottom:1em;font-size:10pt}
        span {font-weight: bold}
    </style>

    <body>

    <xsl:for-each select="CATALOG/PLANT">

        <div class="title">
            <span><xsl:value-of select="BOTANICAL"/></span>
            <xsl:value-of select="PRICE"/>
        </div>

        <div class="description">
                <xsl:value-of select="description"/>
                <span> (<xsl:value-of select="AVAILABILITY"/> will be available)</span>
        </div>

    </xsl:for-each>
    </body>
    </html>
"""

transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)

print(result)
# <html>
# <style>
#     body {font-family: Arial; font-size: 1em; background-color: #EEEEEE}
#     div.title {background-color: teal; color: white; padding: 4px}
#     div.description {margin-left:20px;margin-bottom:1em;font-size:10pt}
#     span {font-weight: bold}
# </style>
# <body>
# <div class="title">
# <span>Sanguinaria canadensis</span>$2.44</div>
# <div class="description"><span> (031599 will be available)</span></div>
# <div class="title">
# <span>Aquilegia canadensis</span>$9.37</div>
# <div class="description"><span> (030699 will be available)</span></div>
# </body>
# </html>

3.4.19. Assignments

Code 3.32. Solution
"""
* Assignment: Serialization XML Parsing
* Complexity: easy
* Lines of code: 20 lines
* Time: 13 min

English:
    1. Use data from "Given" section (see below)
    2. Convert input data to `list[dict]`

Polish:
    1. Użyj danych z sekcji "Given" (patrz poniżej)
    2. Przekonwertuj dane wejściowe do `list[dict]`
"""


# Given
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<CATALOG>
    <PLANT>
        <COMMON>Bloodroot</COMMON>
        <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
        <ZONE>4</ZONE>
        <LIGHT>Mostly Shady</LIGHT>
        <PRICE>$2.44</PRICE>
        <AVAILABILITY>031599</AVAILABILITY>
    </PLANT>
    <PLANT>
        <COMMON>Columbine</COMMON>
        <BOTANICAL>Aquilegia canadensis</BOTANICAL>
        <ZONE>3</ZONE>
        <LIGHT>Mostly Shady</LIGHT>
        <PRICE>$9.37</PRICE>
        <AVAILABILITY>030699</AVAILABILITY>
    </PLANT>
    <PLANT>
        <COMMON>Marsh Marigold</COMMON>
        <BOTANICAL>Caltha palustris</BOTANICAL>
        <ZONE>4</ZONE>
        <LIGHT>Mostly Sunny</LIGHT>
        <PRICE>$6.81</PRICE>
        <AVAILABILITY>051799</AVAILABILITY>
    </PLANT>
    <PLANT>
        <COMMON>Cowslip</COMMON>
        <BOTANICAL>Caltha palustris</BOTANICAL>
        <ZONE>4</ZONE>
        <LIGHT>Mostly Shady</LIGHT>
        <PRICE>$9.90</PRICE>
        <AVAILABILITY>030699</AVAILABILITY>
    </PLANT>
<CATALOG>"""


Code 3.33. Solution
"""
* Assignment: Serialization XSLT Transformation
* Complexity: medium
* Lines of code: 5 lines
* Time: 13 min

English:
    1. Use data from "Given" section (see below)
    2. Convert input data to `list[dict]`

Polish:
    1. Użyj danych z sekcji "Given" (patrz poniżej)
    2. Przekonwertuj dane wejściowe do `list[dict]`
"""


# Given
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
    <food>
        <name>Belgian Waffles</name>
        <price>$5.95</price>
        <description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
        <calories>650</calories>
    </food>
    <food>
        <name>Strawberry Belgian Waffles</name>
        <price>$7.95</price>
        <description>Light Belgian waffles covered with strawberries and whipped cream</description>
        <calories>900</calories>
    </food>
    <food>
        <name>Berry-Berry Belgian Waffles</name>
        <price>$8.95</price>
        <description>Light Belgian waffles covered with an assortment of fresh berries and whipped cream</description>
        <calories>900</calories>
    </food>
    <food>
        <name>French Toast</name>
        <price>$4.50</price>
        <description>Thick slices made from our homemade sourdough bread</description>
        <calories>600</calories>
    </food>
    <food>
        <name>Homestyle Breakfast</name>
        <price>$6.95</price>
        <description>Two eggs, bacon or sausage, toast, and our ever-popular hash browns</description>
        <calories>950</calories>
    </food>
</breakfast_menu>
"""