I was thinking about data serialization in Python again after reading an interesting post that suggests using functools.singledispatch as a way of encapsulating JSON encoding per datatype, and it got me thinking about XML again.
A while back I wrote up this small utility function for serializing data to XML:
import dataclasses
import xml.dom.minidom


def to_xml(obj):
    def build(parent, obj):
        if dataclasses.is_dataclass(obj):
            for key in dataclasses.asdict(obj):
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, getattr(obj, key))
        elif type(obj) == list:
            for elem in obj:
                tag = document.createElement(type(elem).__name__.lower())
                build(tag, elem)
                parent.appendChild(tag)
        elif type(obj) == dict:
            for key in obj:
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, obj[key])
        else:
            data = str(obj)
            tag = document.createTextNode(data)
            parent.appendChild(tag)

    document = xml.dom.minidom.Document()
    build(document, obj)
    return document
It suffers from a few limitations that I hand-waved away as not being worth the effort to fix because I was happy enough with the results. Among them is the glaring failure to handle non-dictionary types at the top level. I can't remember why I did that, but I think it was just an oversight on my part because there's no good reason for it. Secondly, because the tag names for bare lists are produced by introspecting the type of each element, the output is inconsistent across different inputs:
>>> d = {'some_list': [1,2,3]}
>>> print(to_xml({'example': d}).toprettyxml(indent=" "))
<?xml version="1.0" ?>
<example>
 <some_list>
  <int>1</int>
  <int>2</int>
  <int>3</int>
 </some_list>
</example>
>>> e = {'some_list': [1,2,3.0]}
>>> print(to_xml({'example': e}).toprettyxml(indent=" "))
<?xml version="1.0" ?>
<example>
 <some_list>
  <int>1</int>
  <int>2</int>
  <float>3.0</float>
 </some_list>
</example>
I got to thinking that while it wouldn't be hard to replace the introspection-driven tags with a constant name like "value" or "item", I would rather keep a constant name as a fallback and reach for something more explicit first. Could I look for a type hint and use it as the tag name? My hope was that I could start to leverage type hints as more than just documentation and reflect them into the serialized data as well.
First up, then: removing the need for the top-level dictionary type. The most obvious fix I could think of was to explicitly add a root element before recursing:
def to_xml(obj):
    def build(parent, obj):
        if dataclasses.is_dataclass(obj):
            for key in dataclasses.asdict(obj):
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, getattr(obj, key))
        elif type(obj) == list:
            for elem in obj:
                tag = document.createElement(type(elem).__name__.lower())
                build(tag, elem)
                parent.appendChild(tag)
        elif type(obj) == dict:
            for key in obj:
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, obj[key])
        else:
            data = str(obj)
            tag = document.createTextNode(data)
            parent.appendChild(tag)

    document = xml.dom.minidom.Document()
    root_tag_name = type(obj).__name__.lower()
    root = document.createElement(root_tag_name)
    document.appendChild(root)
    build(root, obj)
    return document
Ahh, immediately at least a little better:
@dataclasses.dataclass
class Bar:
    qualities: list[str]


document = Bar(qualities=['poor', 'good', 'fair'])
doc = to_xml(document)
print(doc.toprettyxml(indent=" "))

<?xml version="1.0" ?>
<bar>
 <qualities>
  <str>poor</str>
  <str>good</str>
  <str>fair</str>
 </qualities>
</bar>
Not quite pertinent to improving the XML serializer, but I was pleased to learn that for categorical data the use of an enumerated type already does "the right thing" (to my eye):
import enum


class Quality(enum.StrEnum):
    POOR = enum.auto()
    FAIR = enum.auto()
    GOOD = enum.auto()
    EXCELLENT = enum.auto()


@dataclasses.dataclass
class Example:
    qualities: list[str]


document = Example(qualities=[Quality.FAIR, Quality.GOOD])
doc = to_xml(document)
print(doc.toprettyxml(indent=" "))

<?xml version="1.0" ?>
<example>
 <qualities>
  <quality>fair</quality>
  <quality>good</quality>
 </qualities>
</example>
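It's worth spelling out why this works: the list branch derives each tag from the element's type, and a StrEnum built with enum.auto() stringifies to its lower-cased member name. Continuing the example above in a REPL:

>>> type(Quality.FAIR).__name__
'Quality'
>>> str(Quality.FAIR)
'fair'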
Addressing the list type tags, though, requires a touch more work. I went rummaging around the typing module and surfaced typing.get_type_hints, which acts on functions, methods, modules, and classes. I'm admittedly a bit of a novice at the kind of introspection the typing module relies on, but adding an optional type hint argument to the internal build function allows the hinted type to be used for tag names whenever a hint exists. The number of calls to type is probably higher than in any other code I've written, which gives me a bit of pause, but I don't think it's necessarily bad given the job it's performing. When there is no type annotation I also fall back to "value" rather than the list element's type, so as to normalize expected outputs.
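To make the mechanics concrete before the full code, here's roughly what that introspection looks like in isolation (the Point dataclass is just an invented illustration):

>>> import typing, dataclasses
>>> @dataclasses.dataclass
... class Point:
...     coords: list[int]
...
>>> typing.get_type_hints(Point)
{'coords': list[int]}
>>> typing.get_type_hints(Point)['coords'].__args__
(<class 'int'>,)

Here's the serializer with the optional hint threaded through: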
import typing
import dataclasses
import xml.dom.minidom


def to_xml(obj):
    def build(parent, obj, type_hint=None):
        if dataclasses.is_dataclass(obj):
            for key, value in dataclasses.asdict(obj).items():
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, value, typing.get_type_hints(type(obj)).get(key))
        elif isinstance(obj, list):
            elem_type = None
            if type_hint and hasattr(type_hint, '__args__'):
                elem_type = type_hint.__args__[0]
                tag_name = elem_type.__name__.lower()
            else:
                tag_name = 'value'
            for elem in obj:
                tag = document.createElement(tag_name)
                parent.appendChild(tag)
                build(tag, elem, elem_type)
        elif isinstance(obj, dict):
            for key, value in obj.items():
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, value, typing.get_type_hints(type(obj)).get(key))
        else:
            data = str(obj)
            tag = document.createTextNode(data)
            parent.appendChild(tag)

    document = xml.dom.minidom.Document()
    root_tag_name = type(obj).__name__.lower()
    root = document.createElement(root_tag_name)
    document.appendChild(root)
    build(root, obj, typing.get_type_hints(type(obj)).get('obj'))
    return document
I'm not certain it is the most robust code for handling arbitrary input but it has been working surprisingly well in my testing. On the problem input from before:
>>> d = {'some_list': [1,2,3]}
>>> print(to_xml(d).toprettyxml(indent=" "))
<?xml version="1.0" ?>
<dict>
 <some_list>
  <value>1</value>
  <value>2</value>
  <value>3</value>
 </some_list>
</dict>
>>> e = {'some_list': [1,2,3.0]}
>>> print(to_xml(e).toprettyxml(indent=" "))
<?xml version="1.0" ?>
<dict>
 <some_list>
  <value>1</value>
  <value>2</value>
  <value>3.0</value>
 </some_list>
</dict>
Still not the best on arbitrary dictionaries, but at least the tag names don't vary. I think it starts to shine, though, when used with slightly better practices for handling types. If I had used such a list of mixed types in a dataclass before, the annotation would have been something like some_list: list[int | float], which notably doesn't work with these recent changes because the union type doesn't have a __name__ to derive the tag name from (a defensive fallback is sketched after the example below). If, however, I spend slightly more thought, I would probably pick a name for the type that better describes the domain. That might look like this (excusing the invented examples):
class Quality(enum.StrEnum):
    POOR = enum.auto()
    FAIR = enum.auto()
    GOOD = enum.auto()
    EXCELLENT = enum.auto()


type Age = int | float


@dataclasses.dataclass
class Example:
    ages: list[Age]
    quality: Quality


document = Example(ages=[1,2,3.0,4], quality=Quality.FAIR)
doc = to_xml(document)
print(doc.toprettyxml(indent=" "))
<?xml version="1.0" ?>
<example>
 <ages>
  <age>1</age>
  <age>2</age>
  <age>3.0</age>
  <age>4</age>
 </ages>
 <quality>fair</quality>
</example>
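As noted above, a hint whose element type has no __name__ (a bare int | float, say) would raise rather than degrade. The serializer above doesn't guard against that; a minimal sketch of a fallback, using a hypothetical helper and the same "value" default, might look like this:

def list_tag_name(type_hint):
    # Derive a tag name from a list's element hint, falling back to 'value'
    # when there is no hint or the element type has no __name__
    # (e.g. a bare union like int | float).
    if type_hint is not None and hasattr(type_hint, '__args__'):
        return getattr(type_hint.__args__[0], '__name__', 'value').lower()
    return 'value'

>>> list_tag_name(list[int])
'int'
>>> list_tag_name(list[int | float])
'value'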
I would consider the serializer to still be a bit rough and ready, but when combined with the idea of using single dispatch as an extension mechanism I can start to see some real potential. Instead of calling str in the final else clause of the build function, it could invoke a single-dispatch function over as-yet-undefined types. Here it is, functionally unchanged:
import functools


@functools.singledispatch
def to_serializable(value):
    return str(value)


def to_xml(obj):
    def build(parent, obj, type_hint=None):
        if dataclasses.is_dataclass(obj):
            for key, value in dataclasses.asdict(obj).items():
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, value, typing.get_type_hints(type(obj)).get(key))
        elif isinstance(obj, list):
            elem_type = None
            if type_hint and hasattr(type_hint, '__args__'):
                elem_type = type_hint.__args__[0]
                tag_name = elem_type.__name__.lower()
            else:
                tag_name = 'value'
            for elem in obj:
                tag = document.createElement(tag_name)
                parent.appendChild(tag)
                build(tag, elem, elem_type)
        elif isinstance(obj, dict):
            for key, value in obj.items():
                tag = document.createElement(key)
                parent.appendChild(tag)
                build(tag, value, typing.get_type_hints(type(obj)).get(key))
        else:
            data = to_serializable(obj)
            tag = document.createTextNode(data)
            parent.appendChild(tag)

    document = xml.dom.minidom.Document()
    root_tag_name = type(obj).__name__.lower()
    root = document.createElement(root_tag_name)
    document.appendChild(root)
    build(root, obj, typing.get_type_hints(type(obj)).get('obj'))
    return document
This allows for arbitrary extensions without requiring changes to the to_xml function itself. Here's an earlier example that defines a datetime serializer:
import datetime


@to_serializable.register
def ts_datetime(value: datetime.datetime):
    return value.isoformat()
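The dispatch is easy to sanity-check on its own (the datetime value here is arbitrary); anything without a registered overload still falls through to str:

>>> to_serializable(datetime.datetime(2024, 1, 2, 3, 4, 5))
'2024-01-02T03:04:05'
>>> to_serializable(42)
'42'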
I remain a bit of a type skeptic when writing Python. The opt-in nature and the ergonomics (they continue to feel bolted-on) leave me less than thrilled. Additionally, the repeated experience of watching other people leap to use every feature of the type checker as some kind of mental self-gratification exercise continues to be off-putting. That isn't entirely the fault of the language, and I'm willing to be convinced so long as genuinely useful cases like this keep cropping up.
I said something similar last time, but I really think I'm close to done with this idea. There's just not that much cause to use XML! I have this horrible inclination to "finish" by working out some way to include attributes on the serialized nodes, which would unlock the potential to do things like declaratively matching the output of all sorts of other tools. Not today though!