SP/web2py/gluon/utf8.py
Saturneic 064f602b1a Add.
2018-10-25 23:33:13 +08:00

759 lines
30 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
| This file is part of the web2py Web Framework
| Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
| Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
| for Web2py project
Utilities and a class for managing UTF-8 strings
------------------------------------------------
"""
from __future__ import print_function
from gluon._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native, reload
__all__ = ['Utf8']
# Translation tables (code point -> escaped text) that Utf8.__repr__ feeds to
# unicode.translate().
repr_escape_tab = {}
#FIXME PY3
for i in range(1, 32):
    # Control characters fall back to a two-digit hexadecimal escape ...
    repr_escape_tab[i] = to_unicode("\\"+"x%02x" % i)
# ... unless a conventional short escape exists; the backslash itself is
# doubled.
repr_escape_tab.update({
    7: u'\\a',
    8: u'\\b',
    9: u'\\t',
    10: u'\\n',
    11: u'\\v',
    12: u'\\f',
    13: u'\\r',
    ord('\\'): u'\\\\',
})
# Variant used when the repr is wrapped in single quotes: it also escapes
# the single quote character.
repr_escape_tab2 = dict(repr_escape_tab)
repr_escape_tab2[ord('\'')] = u"\\'"
def sort_key(s):
    """Return a collation key for the utf-8 or unicode string *s*.

    The Unicode Collation Algorithm (UCA)
    (http://www.unicode.org/reports/tr10/) is used for sorting utf-8 and
    unicode strings and for comparing Utf8 strings.

    Note:
        pyuca is a very memory-hungry module! It loads the whole
        "allkey.txt" file (~2mb!) into memory. That functionality is only
        needed when sort_key() is used as the key of a sort or when Utf8
        strings are compared, so this is a lazy function: on its FIRST
        call it imports pyuca and rebinds the module-level name
        ``sort_key`` to the real implementation.

        If pyuca cannot be loaded, the fallback key is the lower-cased
        unicode form of the string.

    Args:
        s (str or unicode): utf-8 encoded byte string or unicode string

    Returns:
        the key produced by whichever implementation was installed
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
        sort_key = lambda s: unicode_sort_key(
            to_unicode(s, 'utf-8') if isinstance(s, str) else s)
    except Exception:
        # Best-effort fallback when the collator is unavailable. Only
        # Exception is caught so that KeyboardInterrupt / SystemExit raised
        # while the (slow) import runs are not silently swallowed.
        sort_key = lambda s: (
            to_unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
    return sort_key(s)
def ord(char):
    """Return the unicode code point of the character *char*.

    *char* is SUPPOSED to be exactly one character, given either as a
    unicode string or as its utf-8 encoded form.
    """
    if not isinstance(char, unicodeT):
        # Decode first so that the builtin ord() receives one character.
        char = to_unicode(char, 'utf-8')
    return __builtin__.ord(char)
def chr(code):
    """Return the character with unicode id *code* as an Utf8 string."""
    character = unichr(code)
    return Utf8(character)
def size(string):
    """Return the size of *string* in bytes once stored as utf-8.

    Note:
        For a unicode string this is the length of the corresponding
        utf-8 string.
    """
    as_utf8 = Utf8(string)
    return as_utf8.__size__()
def truncate(string, length, dots='...'):
    """Return *string* unchanged if it has at most *length* characters,
    otherwise cut it and append the *dots* suffix.

    Lengths are counted in characters, not bytes. A cut result consists of
    the first ``length - len(dots)`` characters followed by *dots*. When
    *length* is smaller than ``len(dots)`` no characters of the text are
    kept and the result is *dots* alone.

    Args:
        string (str or unicode): text to shorten
        length (int): max length of string
        dots (str or unicode): string suffix, used when string is cut

    Returns:
        (utf8-str): original or cut string
    """
    text = to_unicode(string, 'utf-8')
    dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        # Clamp at 0: a negative slice end would count from the END of the
        # text and keep an arbitrary (possibly longer) prefix.
        keep = max(length - len(dots), 0)
        text = text[:keep] + dots
    return str.__new__(Utf8, text.encode('utf-8'))
class Utf8(str):
    """
    Class for storing and manipulating utf-8 strings.

    Instances are byte strings holding utf-8 data, but indexing, length,
    case conversion, searching and formatting work on characters: most
    methods decode to unicode, do the work there and encode the result back.

    The base presupposition of this class usage is:
        "ALL strings in the application are either of
        utf-8 or unicode type, even when simple str
        type is used. UTF-8 is only a "packed" version
        of unicode, so Utf-8 and unicode strings are
        interchangeable."

    CAUTION! This class is slower than str/unicode!
        Do NOT use it inside intensive loops. Simply
        decode string(s) to unicode before the loop and
        encode it back to utf-8 string(s) after the
        intensive calculation.

    You can see the benefit of this class in doctests() below
    """

    def __new__(cls, content='', codepage='utf-8'):
        """Create an Utf8 string from *content*.

        Args:
            content: unicode string, or byte string encoded in *codepage*
            codepage (str): encoding of a byte-string *content*; anything
                other than 'utf-8'/'utf8' is transcoded to utf-8
        """
        if isinstance(content, unicodeT):
            return str.__new__(cls, to_native(content, 'utf-8'))
        elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
            return str.__new__(cls, content)
        else:
            return str.__new__(cls, to_native(to_unicode(content, codepage), 'utf-8'))

    def __repr__(self):
        r''' # note that we use raw strings to avoid having to use double back slashes below
        NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::
        utf8.__repr__() works same as str.repr() when processing ascii string
        >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
        True
        >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
        True
        >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
        True
        >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
        True
        >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
        True
        Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::
        >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
        True
        >>> repr(Utf8('""')) == "'\"\"'" != repr('""')
        True
        >>> repr(Utf8("''")) == '"\'\'"' != repr("''")
        True
        >>> repr(Utf8('\'"')) == repr(Utf8("'\"")) == '\'\\\'"\'' != repr('\'"') == repr("'\"")
        True
        >>> repr(Utf8('\r\n文')) == "'\\r\\n文'" != repr('\r\n文') # Test for \r, \n
        True
        '''
        if str.find(self, "'") >= 0 and str.find(self, '"') < 0:  # only single quote exists
            # Wrap in double quotes so the single quotes need no escaping.
            return '"' + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab), 'utf-8') + '"'
        else:
            return "'" + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab2), 'utf-8') + "'"

    def __size__(self):
        """ length of utf-8 string in bytes """
        return str.__len__(self)

    def __contains__(self, other):
        """Return True if *other* (converted with Utf8()) occurs in the string."""
        return str.__contains__(self, Utf8(other))

    def __getitem__(self, index):
        """Return the item at *index*, counted in characters, as Utf8."""
        return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[index], 'utf-8'))

    def __getslice__(self, begin, end):
        """Return the slice [begin:end], counted in characters, as Utf8."""
        return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[begin:end], 'utf-8'))

    def __add__(self, other):
        """Concatenate with *other*; a unicode *other* is encoded to utf-8 first."""
        return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8')
                                             if isinstance(other, unicode) else other))

    def __len__(self):
        """Return the length in characters (see __size__ for bytes)."""
        return len(to_unicode(self, 'utf-8'))

    def __mul__(self, integer):
        """Return the string repeated *integer* times as Utf8."""
        return str.__new__(Utf8, str.__mul__(self, integer))

    def __eq__(self, string):
        """Return True if *string* (byte or unicode string) has the same
        utf-8 content.

        Non-string operands yield NotImplemented. They must not be passed
        to Utf8(), which would convert them with str() and make
        comparisons such as ``Utf8('5') == 5`` or ``Utf8('None') == None``
        evaluate to True.
        """
        if not isinstance(string, (str, unicodeT)):
            return NotImplemented
        return str.__eq__(self, Utf8(string))

    def __ne__(self, string):
        """Inverse of __eq__; non-string operands yield NotImplemented for
        the same reason as there."""
        if not isinstance(string, (str, unicodeT)):
            return NotImplemented
        return str.__ne__(self, Utf8(string))

    def capitalize(self):
        """Character-aware capitalize(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8'))

    def center(self, length, fillchar=' '):
        """Center the string in a field of *length* characters.

        Args:
            length (int): field width in characters
            fillchar (str or unicode): padding character (default is a
                space), accepted the same way as in ljust() and rjust()
        """
        return str.__new__(Utf8, unicode(self, 'utf-8').center(
            length, unicode(fillchar, 'utf-8')
            if isinstance(fillchar, str) else fillchar).encode('utf-8'))

    def upper(self):
        """Character-aware upper(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8'))

    def lower(self):
        """Character-aware lower(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8'))

    def title(self):
        """Character-aware title(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8'))

    def index(self, string):
        """Return the character index of the first occurrence of *string*
        (utf-8 or unicode), as unicode.index() does."""
        return unicode(self, 'utf-8').index(string if isinstance(string, unicode) else unicode(string, 'utf-8'))

    def isalnum(self):
        """unicode.isalnum() of the decoded string."""
        return unicode(self, 'utf-8').isalnum()

    def isalpha(self):
        """unicode.isalpha() of the decoded string."""
        return unicode(self, 'utf-8').isalpha()

    def isdigit(self):
        """unicode.isdigit() of the decoded string."""
        return unicode(self, 'utf-8').isdigit()

    def islower(self):
        """unicode.islower() of the decoded string."""
        return unicode(self, 'utf-8').islower()

    def isspace(self):
        """unicode.isspace() of the decoded string."""
        return unicode(self, 'utf-8').isspace()

    def istitle(self):
        """unicode.istitle() of the decoded string."""
        return unicode(self, 'utf-8').istitle()

    def isupper(self):
        """unicode.isupper() of the decoded string."""
        return unicode(self, 'utf-8').isupper()

    def zfill(self, length):
        """Zero-fill to *length* characters; returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8'))

    def join(self, iter):
        """Join the items of *iter* with this string as separator.

        A byte-string *iter* is decoded first so that it is split into
        characters rather than bytes; every item is converted with Utf8().
        """
        items = unicode(iter, 'utf-8') if isinstance(iter, str) else iter
        return str.__new__(Utf8, str.join(self, [Utf8(c) for c in items]))

    def lstrip(self, chars=None):
        """str.lstrip() with *chars* converted through Utf8(); returns Utf8."""
        return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars)))

    def rstrip(self, chars=None):
        """str.rstrip() with *chars* converted through Utf8(); returns Utf8."""
        return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars)))

    def strip(self, chars=None):
        """str.strip() with *chars* converted through Utf8(); returns Utf8."""
        return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars)))

    def swapcase(self):
        """Character-aware swapcase(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8'))

    def count(self, sub, start=0, end=None):
        """Count occurrences of *sub*; *start*/*end* are character indexes."""
        unistr = unicode(self, 'utf-8')
        return unistr.count(
            unicode(sub, 'utf-8') if isinstance(sub, str) else sub,
            start, len(unistr) if end is None else end)

    def decode(self, encoding='utf-8', errors='strict'):
        """Decode to unicode; the encoding defaults to utf-8."""
        return str.decode(self, encoding, errors)

    def encode(self, encoding, errors='strict'):
        """Return the content re-encoded from utf-8 into *encoding*."""
        return unicode(self, 'utf-8').encode(encoding, errors)

    def expandtabs(self, tabsize=8):
        """Character-aware expandtabs(); returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8'))

    def find(self, sub, start=None, end=None):
        """Return the character index of *sub*, as unicode.find() does."""
        return unicode(self, 'utf-8').find(unicode(sub, 'utf-8')
                                           if isinstance(sub, str) else sub, start, end)

    def ljust(self, width, fillchar=' '):
        """Left-justify in a field of *width* characters; returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8')
                           if isinstance(fillchar, str) else fillchar).encode('utf-8'))

    def partition(self, sep):
        """str.partition() on *sep*; all three parts are returned as Utf8."""
        (head, sep, tail) = str.partition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def replace(self, old, new, count=-1):
        """str.replace() with utf-8 or unicode arguments; returns Utf8."""
        return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))

    def rfind(self, sub, start=None, end=None):
        """Return the character index of the last *sub*, as unicode.rfind() does."""
        return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8')
                                            if isinstance(sub, str) else sub, start, end)

    def rindex(self, string):
        """Return the character index of the last occurrence of *string*
        (utf-8 or unicode), as unicode.rindex() does."""
        return unicode(self, 'utf-8').rindex(string if isinstance(string, unicode)
                                             else unicode(string, 'utf-8'))

    def rjust(self, width, fillchar=' '):
        """Right-justify in a field of *width* characters; returns Utf8."""
        return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8')
                           if isinstance(fillchar, str) else fillchar).encode('utf-8'))

    def rpartition(self, sep):
        """str.rpartition() on *sep*; all three parts are returned as Utf8."""
        (head, sep, tail) = str.rpartition(self, Utf8(sep))
        return (str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail))

    def rsplit(self, sep=None, maxsplit=-1):
        """str.rsplit() with a utf-8 or unicode *sep*; returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.rsplit(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def split(self, sep=None, maxsplit=-1):
        """str.split() with a utf-8 or unicode *sep*; returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.split(self,
                None if sep is None else Utf8(sep), maxsplit)]

    def splitlines(self, keepends=False):
        """str.splitlines(); returns a list of Utf8."""
        return [str.__new__(Utf8, part) for part in str.splitlines(self, keepends)]

    def startswith(self, prefix, start=0, end=None):
        """Character-aware startswith(); *prefix* may be a utf-8 or unicode
        string or a tuple of them."""
        unistr = unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(unicode(
                s, 'utf-8') if isinstance(s, str) else s for s in prefix)
        elif isinstance(prefix, str):
            prefix = unicode(prefix, 'utf-8')
        return unistr.startswith(prefix, start, len(unistr) if end is None else end)

    def translate(self, table, deletechars=''):
        """Translate the string.

        A dict *table* is applied with unicode.translate() on the decoded
        string; any other table is passed, together with *deletechars*, to
        str.translate() and therefore works on the utf-8 bytes.
        """
        if isinstance(table, dict):
            return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8'))
        else:
            return str.__new__(Utf8, str.translate(self, table, deletechars))

    def endswith(self, prefix, start=0, end=None):
        """Character-aware endswith(); *prefix* may be a utf-8 or unicode
        string or a tuple of them."""
        unistr = unicode(self, 'utf-8')
        if isinstance(prefix, tuple):
            prefix = tuple(unicode(
                s, 'utf-8') if isinstance(s, str) else s for s in prefix)
        elif isinstance(prefix, str):
            prefix = unicode(prefix, 'utf-8')
        return unistr.endswith(prefix, start, len(unistr) if end is None else end)

    if hasattr(str, 'format'):  # Python 2.5 hasn't got str.format() method
        def format(self, *args, **kwargs):
            """unicode.format() with byte-string arguments, keyword names
            and keyword values decoded from utf-8; returns Utf8."""
            args = [unicode(
                s, 'utf-8') if isinstance(s, str) else s for s in args]
            kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
                           unicode(v, 'utf-8') if isinstance(v, str) else v)
                          for k, v in iteritems(kwargs))
            return str.__new__(Utf8, unicode(self, 'utf-8').format(*args, **kwargs).encode('utf-8'))

    def __mod__(self, right):
        """%-formatting; byte strings in *right* (a single value, a tuple,
        or the keys and values of a dict) are decoded from utf-8 first."""
        if isinstance(right, tuple):
            right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v
                          for v in right)
        elif isinstance(right, dict):
            right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
                          unicode(v, 'utf-8') if isinstance(v, str) else v)
                         for k, v in iteritems(right))
        elif isinstance(right, str):
            right = unicode(right, 'utf-8')
        return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))

    # Ordering comparisons go through sort_key() instead of comparing bytes.
    def __ge__(self, string):
        return sort_key(self) >= sort_key(string)

    def __gt__(self, string):
        return sort_key(self) > sort_key(string)

    def __le__(self, string):
        return sort_key(self) <= sort_key(string)

    def __lt__(self, string):
        return sort_key(self) < sort_key(string)
# Self-test entry point: executed only when this module is run as a script.
if __name__ == '__main__':
    def doctests():
        u"""
doctests:
>>> test_unicode=u'ПРоба Є PRobe'
>>> test_unicode_word=u'ПРоба'
>>> test_number_str='12345'
>>> test_unicode
u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
>>> print test_unicode
ПРоба Є PRobe
>>> test_word=test_unicode_word.encode('utf-8')
>>> test_str=test_unicode.encode('utf-8')
>>> s=Utf8(test_str)
>>> s
'ПРоба Є PRobe'
>>> type(s)
<class '__main__.Utf8'>
>>> s == test_str
True
>>> len(test_str) # wrong length of utf8-string!
19
>>> len(test_unicode) # RIGHT!
13
>>> len(s) # RIGHT!
13
>>> size(test_str) # size of utf-8 string (in bytes) == len(str)
19
>>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
19
>>> size(s) # size of utf-8 string in bytes
19
>>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
... __builtin__.ord('б') # ascii string
... except Exception, e:
... print 'Exception:', e
Exception: ord() expected a character, but string of length 2 found
>>> ord('б') # utf8.ord() is used(!!!)
1073
>>> ord(u'б') # utf8.ord() is used(!!!)
1073
>>> ord(s[3]) # utf8.ord() is used(!!!)
1073
>>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
'б'
>>> type(chr(1073)) # utf8.chr() is used(!!!)
<class '__main__.Utf8'>
>>> s=Utf8(test_unicode)
>>> s
'ПРоба Є PRobe'
>>> s == test_str
True
>>> test_str == s
True
>>> s == test_unicode
True
>>> test_unicode == s
True
>>> print test_str.upper() # only ASCII characters uppered
ПРоба Є PROBE
>>> print test_unicode.upper() # unicode gives right result
ПРОБА Є PROBE
>>> s.upper() # utf8 class use unicode.upper()
'ПРОБА Є PROBE'
>>> type(s.upper())
<class '__main__.Utf8'>
>>> s.lower()
'проба є probe'
>>> type(s.lower())
<class '__main__.Utf8'>
>>> s.capitalize()
'Проба є probe'
>>> type(s.capitalize())
<class '__main__.Utf8'>
>>> len(s)
13
>>> len(test_unicode)
13
>>> s+'. Probe is проба'
'ПРоба Є PRobe. Probe is проба'
>>> type(s+'. Probe is проба')
<class '__main__.Utf8'>
>>> s+u'. Probe is проба'
'ПРоба Є PRobe. Probe is проба'
>>> type(s+u'. Probe is проба')
<class '__main__.Utf8'>
>>> s+s
'ПРоба Є PRobeПРоба Є PRobe'
>>> type(s+s)
<class '__main__.Utf8'>
>>> a=s
>>> a+=s
>>> a+=test_unicode
>>> a+=test_str
>>> a
'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
>>> type(a)
<class '__main__.Utf8'>
>>> s*3
'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
>>> type(s*3)
<class '__main__.Utf8'>
>>> a=Utf8("-проба-")
>>> a*=10
>>> a
'-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
>>> type(a)
<class '__main__.Utf8'>
>>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
'ПРоба Є PRobe'
>>> s.center(17) # RIGHT!
' ПРоба Є PRobe '
>>> type(s.center(17))
<class '__main__.Utf8'>
>>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
False
>>> Utf8(test_word+test_number_str).isalnum()
True
>>> s.isalnum()
False
>>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
False
>>> Utf8(test_word).isalpha() # RIGHT!
True
>>> s.lower().islower()
True
>>> s.upper().isupper()
True
>>> print test_str.zfill(17) # WRONG RESULT!
ПРоба Є PRobe
>>> s.zfill(17) # RIGHT!
'0000ПРоба Є PRobe'
>>> type(s.zfill(17))
<class '__main__.Utf8'>
>>> s.istitle()
False
>>> s.title().istitle()
True
>>> Utf8('1234').isdigit()
True
>>> Utf8(' \t').isspace()
True
>>> s.join('•|•')
'•ПРоба Є PRobe|ПРоба Є PRobe•'
>>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
'(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
>>> type(s)
<class '__main__.Utf8'>
>>> s==test_str
True
>>> s==test_unicode
True
>>> s.swapcase()
'прОБА є prOBE'
>>> type(s.swapcase())
<class '__main__.Utf8'>
>>> truncate(s, 10)
'ПРоба Є...'
>>> truncate(s, 20)
'ПРоба Є PRobe'
>>> truncate(s, 10, '•••') # utf-8 string as *dots*
'ПРоба Є•••'
>>> truncate(s, 10, u'®') # you can use unicode string as *dots*
'ПРоба Є P®'
>>> type(truncate(s, 10))
<class '__main__.Utf8'>
>>> Utf8(s.encode('koi8-u'), 'koi8-u')
'ПРоба Є PRobe'
>>> s.decode() # convert utf-8 string to unicode
u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
>>> a='про\\tba'
>>> str_tmp=a.expandtabs()
>>> utf8_tmp=Utf8(a).expandtabs()
>>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
'про.....ba'
>>> utf8_tmp.index('b')
8
>>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
'про..ba'
>>> str_tmp.index('b') # WRONG index of 'b' character
8
>>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
'про..ba'
>>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
'про.ba'
>>> s.find('Є')
6
>>> s.find(u'Є')
6
>>> s.find(' ', 6)
7
>>> s.rfind(' ')
7
>>> s.partition('Є')
('ПРоба ', 'Є', ' PRobe')
>>> s.partition(u'Є')
('ПРоба ', 'Є', ' PRobe')
>>> (a,b,c) = s.partition('Є')
>>> type(a), type(b), type(c)
(<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
>>> s.partition(' ')
('ПРоба', ' ', 'Є PRobe')
>>> s.rpartition(' ')
('ПРоба Є', ' ', 'PRobe')
>>> s.index('Є')
6
>>> s.rindex(u'Є')
6
>>> s.index(' ')
5
>>> s.rindex(' ')
7
>>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
>>> a.split()
['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
>>> a.rsplit()
['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
>>> a.expandtabs().split('б')
['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
>>> a.expandtabs().rsplit('б')
['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
>>> a.expandtabs().split(u'б', 1)
['а ', ' ц д е а б ц д е а б ц д е']
>>> a.expandtabs().rsplit(u'б', 1)
['а б ц д е а б ц д е а ', ' ц д е']
>>> a=Utf8("рядок1\\nрядок2\\nрядок3")
>>> a.splitlines()
['рядок1', 'рядок2', 'рядок3']
>>> a.splitlines(True)
['рядок1\\n', 'рядок2\\n', 'рядок3']
>>> s[6]
'Є'
>>> s[0]
'П'
>>> s[-1]
'e'
>>> s[:10]
'ПРоба Є PR'
>>> s[2:-2:2]
'оаЄPo'
>>> s[::-1]
'eboRP Є абоРП'
>>> s.startswith('ПР')
True
>>> s.startswith(('ПР', u'об'),0)
True
>>> s.startswith(u'об', 2, 4)
True
>>> s.endswith('be')
True
>>> s.endswith(('be', 'PR', u'Є'))
True
>>> s.endswith('PR', 8, 10)
True
>>> s.endswith('Є', -7, -6)
True
>>> s.count(' ')
2
>>> s.count(' ',6)
1
>>> s.count(u'Є')
1
>>> s.count('Є', 0, 5)
0
>>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" }
"Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
>>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
>>> a%=(s, s[::-1], 1000)
>>> a
'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
>>> if hasattr(Utf8, 'format'):
... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
... else: # format() method is not used in python with version <2.6:
... print True
True
>>> u'Б'<u'Ї' # WRONG ORDER!
False
>>> 'Б'<'Ї' # WRONG ORDER!
False
>>> Utf8('Б')<'Ї' # RIGHT!
True
>>> u'д'>u'ґ' # WRONG ORDER!
False
>>> Utf8('д')>Utf8('ґ') # RIGHT!
True
>>> u'є'<=u'ж' # WRONG ORDER!
False
>>> Utf8('є')<=u'ж' # RIGHT!
True
>>> Utf8('є')<=u'є'
True
>>> u'Ї'>=u'И' # WRONG ORDER!
False
>>> Utf8(u'Ї') >= u'И' # RIGHT
True
>>> Utf8('Є') >= 'Є'
True
>>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type
>>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
>>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
>>> result = "".join(sorted(a))
>>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
'\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
>>> try:
... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
... except Exception, e:
... print 'Exception:', e
Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
>>> try: # FAILED! (working with bytes, not with utf8-charactes)
... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
... except Exception, e:
... print 'Exception:', e
Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
>>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
>>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
>>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
>>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
>>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
>>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
>>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
... ], key=sort_key):
... print result.ljust(20), type(result)
абетка <type 'str'>
Астро <type 'str'>
аякс <type 'str'>
білінг <type 'str'>
веб <type 'str'>
гала <type 'unicode'>
ґанок <type 'str'>
Гоша <class '__main__.Utf8'>
Дар'я <class '__main__.Utf8'>
Єва <type 'str'>
Жужа <type 'unicode'>
Іа <type 'str'>
Їжа <type 'str'>
Київ <type 'str'>
лимонад <type 'str'>
ложка <type 'str'>
Матриця <type 'str'>
проба <type 'str'>
тест <type 'unicode'>
шовк <type 'str'>
Юляся <type 'str'>
яблуко <type 'str'>
>>> a=Utf8("中文字")
>>> L=list(a)
>>> L
['', '', '']
>>> a="".join(L)
>>> print a
中文字
>>> type(a)
<type 'str'>
>>> a="中文字" # standard str type
>>> L=list(a)
>>> L
['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
>>> from string import maketrans
>>> str_tab=maketrans('PRobe','12345')
>>> unicode_tab={ord(u'П'):ord(u'Ж'),
... ord(u'Р') : u'Ш',
... ord(Utf8('о')) : None, # utf8.ord() is used
... ord('б') : None, # -//-//-
... ord(u'а') : u"中文字",
... ord(u'Є') : Utf8('').decode(), # only unicode type is supported
... }
>>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
'ЖШ中文字•12345'
"""
        import sys
        # NOTE(review): reload(sys) presumably re-exposes
        # sys.setdefaultencoding(), which Python 2 removes from the module
        # at startup -- confirm. The examples above use Python 2 syntax.
        reload(sys)
        sys.setdefaultencoding("UTF-8")
        import doctest
        print("DOCTESTS STARTED...")
        # With no arguments testmod() collects the docstring examples of
        # the module being run as the script.
        doctest.testmod()
        print("DOCTESTS FINISHED")

    doctests()