Skip to content

Commit f1e3d62

Browse files
committed
Merge pull request #224 from CybOXProject/unicode
More unicode and encoding support.
2 parents cf78bb3 + 9049211 commit f1e3d62

File tree

4 files changed

+80
-37
lines changed

4 files changed

+80
-37
lines changed

cybox/__init__.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import json
99
from StringIO import StringIO
1010

11+
import cybox.bindings as bindings
1112
import cybox.utils.idgen
1213
from cybox.utils import Namespace, META
1314

@@ -233,8 +234,12 @@ def from_dict(cls, cls_dict=None):
233234
return entity
234235

235236
def to_xml(self, include_namespaces=True, namespace_dict=None,
236-
pretty=True):
237-
"""Export an object as an XML String.
237+
pretty=True, encoding='utf-8'):
238+
"""Serializes a :class:`Entity` instance to an XML string.
239+
240+
The default character encoding is ``utf-8`` and can be set via the
241+
`encoding` parameter. If `encoding` is ``None``, a unicode string
242+
is returned.
238243
239244
Args:
240245
include_namespaces (bool): whether to include xmlns and
@@ -244,9 +249,13 @@ def to_xml(self, include_namespaces=True, namespace_dict=None,
244249
prefixes
245250
pretty (bool): whether to produce readable (``True``) or compact
246251
(``False``) output. Defaults to ``True``.
252+
encoding: The output character encoding. Default is ``utf-8``. If
253+
`encoding` is set to ``None``, a unicode string is returned.
247254
248255
Returns:
249-
XML string
256+
An XML string for this
257+
:class:`Entity` instance. Default character encoding is ``utf-8``.
258+
250259
"""
251260
namespace_def = ""
252261

@@ -256,10 +265,22 @@ def to_xml(self, include_namespaces=True, namespace_dict=None,
256265
if not pretty:
257266
namespace_def = namespace_def.replace('\n\t', ' ')
258267

259-
s = StringIO()
260-
self.to_obj().export(s.write, 0, namespacedef_=namespace_def,
261-
pretty_print=pretty)
262-
return s.getvalue().strip()
268+
269+
with bindings.save_encoding(encoding):
270+
sio = StringIO()
271+
self.to_obj().export(
272+
sio.write,
273+
0,
274+
namespacedef_=namespace_def,
275+
pretty_print=pretty
276+
)
277+
278+
s = unicode(sio.getvalue()).strip()
279+
280+
if encoding:
281+
return s.encode(encoding)
282+
283+
return s
263284

264285
def to_json(self):
265286
"""Export an object as a JSON String."""

cybox/bindings/__init__.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import base64
55
from datetime import datetime, tzinfo, timedelta
66
import re
7+
import contextlib
78

89
from xml.sax import saxutils
910
from lxml import etree as etree_
@@ -19,6 +20,18 @@
1920
_Tag_strip_pattern_ = re.compile(r'\{.*\}')
2021

2122

23+
@contextlib.contextmanager
def save_encoding(encoding='utf-8'):
    """Temporarily override the module-level ``ExternalEncoding``.

    While the ``with`` block runs, ``ExternalEncoding`` is set to `encoding`.
    The previous value is restored on exit, including when the block raises.

    Args:
        encoding: Name of the character encoding to install for the duration
            of the block. Defaults to ``utf-8``. If ``None``, the current
            ``ExternalEncoding`` is left untouched: ``None`` is how callers
            ask for unicode output, and it is not a valid codec name for the
            ``text.decode(ExternalEncoding)`` fallbacks in this module.

    Yields:
        None. The context manager is used only for its side effect.
    """
    global ExternalEncoding

    # Read the current value *before* entering the ``try`` block. If this
    # lookup fails there is nothing to restore, and the ``finally`` clause
    # must not raise a second NameError that hides the real one.
    orig_encoding = ExternalEncoding

    # Only install real codec names; see the note on ``None`` above.
    if encoding is not None:
        ExternalEncoding = encoding

    try:
        yield
    finally:
        ExternalEncoding = orig_encoding
33+
34+
2235
def parsexml_(*args, **kwargs):
2336
if 'parser' not in kwargs:
2437
# Use the lxml ElementTree compatible parser so that, e.g.,
@@ -258,7 +271,7 @@ def showIndent(lwrite, level, pretty_print=True):
258271

259272
def quote_xml(text):
260273
if text is None:
261-
return ''
274+
return u''
262275

263276
# Convert `text` to unicode string. This is mainly a catch-all for non
264277
# string/unicode types like bool and int.
@@ -267,9 +280,6 @@ def quote_xml(text):
267280
except UnicodeDecodeError:
268281
text = text.decode(ExternalEncoding)
269282

270-
# Convert unicode string to correct output character encoding.
271-
text = text.encode(ExternalEncoding)
272-
273283
# If it's a CDATA block, return the text as is.
274284
if text.startswith(CDATA_START):
275285
return text
@@ -281,7 +291,7 @@ def quote_xml(text):
281291

282292
def quote_attrib(text):
283293
if text is None:
284-
return '""' # Return an empty XML attribute value
294+
return u'""'
285295

286296
# Convert `text` to unicode string. This is mainly a catch-all for non
287297
# string/unicode types like bool and int.
@@ -290,9 +300,6 @@ def quote_attrib(text):
290300
except UnicodeDecodeError:
291301
text = text.decode(ExternalEncoding)
292302

293-
# Convert the unicode string to the correct output character encoding.
294-
text = text.encode(ExternalEncoding)
295-
296303
# Return the escaped value of text.
297304
# Note: This wraps the escaped text in quotation marks.
298305
return saxutils.quoteattr(text)

cybox/test/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import json
55
import unittest
66

7+
import cybox.bindings as bindings
78
from cybox import Entity, EntityList, TypedField
89
import cybox.bindings.cybox_core as core_binding
910
from cybox.core import Observables
@@ -83,7 +84,10 @@ def round_trip(o, output=False, list_=False):
8384
xobj = o2.to_obj()
8485

8586
# 6. Bindings Object -> XML String
86-
xml_string = o2.to_xml(include_namespaces=True)
87+
xml_string = o2.to_xml(encoding=bindings.ExternalEncoding)
88+
89+
if not isinstance(xml_string, unicode):
90+
xml_string = xml_string.decode(bindings.ExternalEncoding)
8791

8892
if output:
8993
print(xml_string)

cybox/test/encoding_test.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ def test_whois(self):
5555

5656
def test_quote_xml(self):
5757
s = bindings.quote_xml(UNICODE_STR)
58-
s = s.decode(bindings.ExternalEncoding)
5958
self.assertEqual(s, UNICODE_STR)
6059

6160
def test_quote_attrib(self):
@@ -69,77 +68,89 @@ def test_quote_attrib(self):
6968
"""
7069
s = bindings.quote_attrib(UNICODE_STR)
7170
s = s[1:-1]
72-
s = s.decode(bindings.ExternalEncoding)
7371
self.assertEqual(s, UNICODE_STR)
7472

7573
def test_quote_attrib_int(self):
7674
i = 65536
7775
s = bindings.quote_attrib(i)
78-
s = s[1:-1]
79-
self.assertEqual(str(i), s)
76+
self.assertEqual(u'"65536"', s)
8077

8178
def test_quote_attrib_bool(self):
8279
b = True
8380
s = bindings.quote_attrib(b)
84-
s = s[1:-1]
85-
self.assertEqual(str(b), s)
81+
self.assertEqual(u'"True"', s)
8682

8783
def test_quote_xml_int(self):
8884
i = 65536
8985
s = bindings.quote_xml(i)
90-
self.assertEqual(str(i), s)
86+
self.assertEqual(unicode(i), s)
9187

9288
def test_quote_xml_bool(self):
9389
b = True
9490
s = bindings.quote_xml(b)
95-
self.assertEqual(str(b), s)
91+
self.assertEqual(unicode(b), s)
9692

9793
def test_quote_xml_encoded(self):
9894
encoding = bindings.ExternalEncoding
9995
encoded = UNICODE_STR.encode(encoding)
10096
quoted = bindings.quote_xml(encoded)
101-
decoded = quoted.decode(encoding)
102-
self.assertEqual(UNICODE_STR, decoded)
97+
self.assertEqual(UNICODE_STR, quoted)
10398

10499
def test_quote_attrib_encoded(self):
105100
encoding = bindings.ExternalEncoding
106101
encoded = UNICODE_STR.encode(encoding)
107102
quoted = bindings.quote_attrib(encoded)[1:-1]
108-
decoded = quoted.decode(encoding)
109-
self.assertEqual(UNICODE_STR, decoded)
103+
self.assertEqual(UNICODE_STR, quoted)
110104

111105
def test_quote_xml_zero(self):
112106
i = 0
113107
s = bindings.quote_xml(i)
114-
self.assertEqual(str(i), s)
108+
self.assertEqual(unicode(i), s)
115109

116110
def test_quote_attrib_zero(self):
117111
i = 0
118112
s = bindings.quote_attrib(i)
119-
s = s[1:-1]
120-
self.assertEqual(str(i), s)
113+
self.assertEqual(u'"0"', s)
121114

122115
def test_quote_xml_none(self):
123116
i = None
124117
s = bindings.quote_xml(i)
125-
self.assertEqual('', s)
118+
self.assertEqual(u'', s)
126119

127120
def test_quote_attrib_none(self):
128121
i = None
129122
s = bindings.quote_attrib(i)
130-
s = s[1:-1]
131-
self.assertEqual('', s)
123+
self.assertEqual(u'""', s)
132124

133125
def test_quote_attrib_empty(self):
134126
i = ''
135127
s = bindings.quote_attrib(i)
136-
s = s[1:-1]
137-
self.assertEqual('', s)
128+
self.assertEqual(u'""', s)
138129

139130
def test_quote_xml_empty(self):
140131
i = ''
141132
s = bindings.quote_xml(i)
142-
self.assertEqual('', s)
133+
self.assertEqual(u'', s)
134+
135+
def test_to_xml_utf16_encoded(self):
136+
encoding = 'utf-16'
137+
o = Observable()
138+
o.title = UNICODE_STR
139+
xml = o.to_xml(encoding=encoding)
140+
self.assertTrue(UNICODE_STR in xml.decode(encoding))
141+
142+
def test_to_xml_default_encoded(self):
143+
o = Observable()
144+
o.title = UNICODE_STR
145+
xml = o.to_xml()
146+
self.assertTrue(UNICODE_STR in xml.decode('utf-8'))
147+
148+
def test_to_xml_no_encoding(self):
149+
o = Observable()
150+
o.title = UNICODE_STR
151+
xml = o.to_xml(encoding=None)
152+
self.assertTrue(isinstance(xml, unicode))
153+
self.assertTrue(UNICODE_STR in xml)
143154

144155
if __name__ == "__main__":
145156
unittest.main()

0 commit comments

Comments
 (0)