759 lines
30 KiB
Python
759 lines
30 KiB
Python
|
#!/usr/bin/env python
|
|||
|
# -*- coding: utf-8 -*-
|
|||
|
"""
|
|||
|
| This file is part of the web2py Web Framework
|
|||
|
| Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
|
|||
|
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
|
|||
|
| Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
|
|||
|
| for Web2py project
|
|||
|
|
|||
|
Utilities and class for UTF8 strings managing
|
|||
|
----------------------------------------------
|
|||
|
"""
|
|||
|
from __future__ import print_function
|
|||
|
from gluon._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native, reload
|
|||
|
|
|||
|
__all__ = ['Utf8']
|
|||
|
|
|||
|
repr_escape_tab = {}
|
|||
|
#FIXME PY3
|
|||
|
for i in range(1, 32):
|
|||
|
repr_escape_tab[i] = to_unicode("\\"+"x%02x" % i)
|
|||
|
repr_escape_tab[7] = u'\\a'
|
|||
|
repr_escape_tab[8] = u'\\b'
|
|||
|
repr_escape_tab[9] = u'\\t'
|
|||
|
repr_escape_tab[10] = u'\\n'
|
|||
|
repr_escape_tab[11] = u'\\v'
|
|||
|
repr_escape_tab[12] = u'\\f'
|
|||
|
repr_escape_tab[13] = u'\\r'
|
|||
|
repr_escape_tab[ord('\\')] = u'\\\\'
|
|||
|
repr_escape_tab2 = repr_escape_tab.copy()
|
|||
|
repr_escape_tab2[ord('\'')] = u"\\'"
|
|||
|
|
|||
|
|
|||
|
def sort_key(s):
|
|||
|
"""Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
|
|||
|
is used for utf-8 and unicode strings sorting and for utf-8 strings
|
|||
|
comparison
|
|||
|
|
|||
|
Note:
|
|||
|
pyuca is a very memory cost module! It loads the whole
|
|||
|
"allkey.txt" file (~2mb!) into the memory. But this
|
|||
|
functionality is needed only when sort_key() is called as a
|
|||
|
part of sort() function or when Utf8 strings are compared.
|
|||
|
|
|||
|
So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS
|
|||
|
FIRST CALL) imports pyuca and replaces itself with a real
|
|||
|
sort_key() function
|
|||
|
"""
|
|||
|
global sort_key
|
|||
|
try:
|
|||
|
from gluon.contrib.pyuca import unicode_collator
|
|||
|
unicode_sort_key = unicode_collator.sort_key
|
|||
|
sort_key = lambda s: unicode_sort_key(
|
|||
|
to_unicode(s, 'utf-8') if isinstance(s, str) else s)
|
|||
|
except:
|
|||
|
sort_key = lambda s: (
|
|||
|
to_unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
|
|||
|
return sort_key(s)
|
|||
|
|
|||
|
|
|||
|
def ord(char):
|
|||
|
"""Returns unicode id for utf8 or unicode *char* character
|
|||
|
SUPPOSE that *char* is an utf-8 or unicode character only
|
|||
|
"""
|
|||
|
if isinstance(char, unicodeT):
|
|||
|
return __builtin__.ord(char)
|
|||
|
return __builtin__.ord(to_unicode(char, 'utf-8'))
|
|||
|
|
|||
|
|
|||
|
def chr(code):
|
|||
|
"""Returns utf8-character with *code* unicode id """
|
|||
|
return Utf8(unichr(code))
|
|||
|
|
|||
|
|
|||
|
def size(string):
|
|||
|
"""Returns length of utf-8 string in bytes
|
|||
|
|
|||
|
Note:
|
|||
|
The length of correspondent utf-8 string is returned for unicode string
|
|||
|
"""
|
|||
|
return Utf8(string).__size__()
|
|||
|
|
|||
|
|
|||
|
def truncate(string, length, dots='...'):
|
|||
|
"""Returns string of length < *length* or truncate string with adding
|
|||
|
*dots* suffix to the string's end
|
|||
|
|
|||
|
Args:
|
|||
|
length (int): max length of string
|
|||
|
dots (str or unicode): string suffix, when string is cutted
|
|||
|
|
|||
|
Returns:
|
|||
|
(utf8-str): original or cutted string
|
|||
|
"""
|
|||
|
text = to_unicode(string, 'utf-8')
|
|||
|
dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
|
|||
|
if len(text) > length:
|
|||
|
text = text[:length - len(dots)] + dots
|
|||
|
return str.__new__(Utf8, text.encode('utf-8'))
|
|||
|
|
|||
|
|
|||
|
class Utf8(str):
|
|||
|
"""
|
|||
|
Class for utf8 string storing and manipulations
|
|||
|
|
|||
|
The base presupposition of this class usage is:
|
|||
|
"ALL strings in the application are either of
|
|||
|
utf-8 or unicode type, even when simple str
|
|||
|
type is used. UTF-8 is only a "packed" version
|
|||
|
of unicode, so Utf-8 and unicode strings are
|
|||
|
interchangeable."
|
|||
|
|
|||
|
CAUTION! This class is slower than str/unicode!
|
|||
|
Do NOT use it inside intensive loops. Simply
|
|||
|
decode string(s) to unicode before loop and
|
|||
|
encode it back to utf-8 string(s) after
|
|||
|
intensive calculation.
|
|||
|
|
|||
|
You can see the benefit of this class in doctests() below
|
|||
|
"""
|
|||
|
def __new__(cls, content='', codepage='utf-8'):
|
|||
|
if isinstance(content, unicodeT):
|
|||
|
return str.__new__(cls, to_native(content, 'utf-8'))
|
|||
|
elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
|
|||
|
return str.__new__(cls, content)
|
|||
|
else:
|
|||
|
return str.__new__(cls, to_native(to_unicode(content, codepage), 'utf-8'))
|
|||
|
|
|||
|
def __repr__(self):
|
|||
|
r''' # note that we use raw strings to avoid having to use double back slashes below
|
|||
|
NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::
|
|||
|
|
|||
|
utf8.__repr__() works same as str.repr() when processing ascii string
|
|||
|
>>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
|
|||
|
True
|
|||
|
>>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
|
|||
|
True
|
|||
|
>>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
|
|||
|
True
|
|||
|
>>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
|
|||
|
True
|
|||
|
>>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
|
|||
|
True
|
|||
|
|
|||
|
Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::
|
|||
|
|
|||
|
>>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
|
|||
|
True
|
|||
|
>>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
|
|||
|
True
|
|||
|
>>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
|
|||
|
True
|
|||
|
>>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
|
|||
|
True
|
|||
|
>>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
|
|||
|
True
|
|||
|
'''
|
|||
|
if str.find(self, "'") >= 0 and str.find(self, '"') < 0: # only single quote exists
|
|||
|
return '"' + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab), 'utf-8') + '"'
|
|||
|
else:
|
|||
|
return "'" + to_native(to_unicode(self, 'utf-8').translate(repr_escape_tab2), 'utf-8') + "'"
|
|||
|
|
|||
|
def __size__(self):
|
|||
|
""" length of utf-8 string in bytes """
|
|||
|
return str.__len__(self)
|
|||
|
|
|||
|
def __contains__(self, other):
|
|||
|
return str.__contains__(self, Utf8(other))
|
|||
|
|
|||
|
def __getitem__(self, index):
|
|||
|
return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[index], 'utf-8'))
|
|||
|
|
|||
|
def __getslice__(self, begin, end):
|
|||
|
return str.__new__(Utf8, to_native(to_unicode(self, 'utf-8')[begin:end], 'utf-8'))
|
|||
|
|
|||
|
def __add__(self, other):
|
|||
|
return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8')
|
|||
|
if isinstance(other, unicode) else other))
|
|||
|
|
|||
|
def __len__(self):
|
|||
|
return len(to_unicode(self, 'utf-8'))
|
|||
|
|
|||
|
def __mul__(self, integer):
|
|||
|
return str.__new__(Utf8, str.__mul__(self, integer))
|
|||
|
|
|||
|
def __eq__(self, string):
|
|||
|
return str.__eq__(self, Utf8(string))
|
|||
|
|
|||
|
def __ne__(self, string):
|
|||
|
return str.__ne__(self, Utf8(string))
|
|||
|
|
|||
|
def capitalize(self):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8'))
|
|||
|
|
|||
|
def center(self, length):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').center(length).encode('utf-8'))
|
|||
|
|
|||
|
def upper(self):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8'))
|
|||
|
|
|||
|
def lower(self):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8'))
|
|||
|
|
|||
|
def title(self):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8'))
|
|||
|
|
|||
|
def index(self, string):
|
|||
|
return unicode(self, 'utf-8').index(string if isinstance(string, unicode) else unicode(string, 'utf-8'))
|
|||
|
|
|||
|
def isalnum(self):
|
|||
|
return unicode(self, 'utf-8').isalnum()
|
|||
|
|
|||
|
def isalpha(self):
|
|||
|
return unicode(self, 'utf-8').isalpha()
|
|||
|
|
|||
|
def isdigit(self):
|
|||
|
return unicode(self, 'utf-8').isdigit()
|
|||
|
|
|||
|
def islower(self):
|
|||
|
return unicode(self, 'utf-8').islower()
|
|||
|
|
|||
|
def isspace(self):
|
|||
|
return unicode(self, 'utf-8').isspace()
|
|||
|
|
|||
|
def istitle(self):
|
|||
|
return unicode(self, 'utf-8').istitle()
|
|||
|
|
|||
|
def isupper(self):
|
|||
|
return unicode(self, 'utf-8').isupper()
|
|||
|
|
|||
|
def zfill(self, length):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8'))
|
|||
|
|
|||
|
def join(self, iter):
|
|||
|
return str.__new__(Utf8, str.join(self, [Utf8(c) for c in
|
|||
|
list(unicode(iter, 'utf-8') if
|
|||
|
isinstance(iter, str) else
|
|||
|
iter)]))
|
|||
|
|
|||
|
def lstrip(self, chars=None):
|
|||
|
return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars)))
|
|||
|
|
|||
|
def rstrip(self, chars=None):
|
|||
|
return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars)))
|
|||
|
|
|||
|
def strip(self, chars=None):
|
|||
|
return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars)))
|
|||
|
|
|||
|
def swapcase(self):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8'))
|
|||
|
|
|||
|
def count(self, sub, start=0, end=None):
|
|||
|
unistr = unicode(self, 'utf-8')
|
|||
|
return unistr.count(
|
|||
|
unicode(sub, 'utf-8') if isinstance(sub, str) else sub,
|
|||
|
start, len(unistr) if end is None else end)
|
|||
|
|
|||
|
def decode(self, encoding='utf-8', errors='strict'):
|
|||
|
return str.decode(self, encoding, errors)
|
|||
|
|
|||
|
def encode(self, encoding, errors='strict'):
|
|||
|
return unicode(self, 'utf-8').encode(encoding, errors)
|
|||
|
|
|||
|
def expandtabs(self, tabsize=8):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8'))
|
|||
|
|
|||
|
def find(self, sub, start=None, end=None):
|
|||
|
return unicode(self, 'utf-8').find(unicode(sub, 'utf-8')
|
|||
|
if isinstance(sub, str) else sub, start, end)
|
|||
|
|
|||
|
def ljust(self, width, fillchar=' '):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8')
|
|||
|
if isinstance(fillchar, str) else fillchar).encode('utf-8'))
|
|||
|
|
|||
|
def partition(self, sep):
|
|||
|
(head, sep, tail) = str.partition(self, Utf8(sep))
|
|||
|
return (str.__new__(Utf8, head),
|
|||
|
str.__new__(Utf8, sep),
|
|||
|
str.__new__(Utf8, tail))
|
|||
|
|
|||
|
def replace(self, old, new, count=-1):
|
|||
|
return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))
|
|||
|
|
|||
|
def rfind(self, sub, start=None, end=None):
|
|||
|
return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8')
|
|||
|
if isinstance(sub, str) else sub, start, end)
|
|||
|
|
|||
|
def rindex(self, string):
|
|||
|
return unicode(self, 'utf-8').rindex(string if isinstance(string, unicode)
|
|||
|
else unicode(string, 'utf-8'))
|
|||
|
|
|||
|
def rjust(self, width, fillchar=' '):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8')
|
|||
|
if isinstance(fillchar, str) else fillchar).encode('utf-8'))
|
|||
|
|
|||
|
def rpartition(self, sep):
|
|||
|
(head, sep, tail) = str.rpartition(self, Utf8(sep))
|
|||
|
return (str.__new__(Utf8, head),
|
|||
|
str.__new__(Utf8, sep),
|
|||
|
str.__new__(Utf8, tail))
|
|||
|
|
|||
|
def rsplit(self, sep=None, maxsplit=-1):
|
|||
|
return [str.__new__(Utf8, part) for part in str.rsplit(self,
|
|||
|
None if sep is None else Utf8(sep), maxsplit)]
|
|||
|
|
|||
|
def split(self, sep=None, maxsplit=-1):
|
|||
|
return [str.__new__(Utf8, part) for part in str.split(self,
|
|||
|
None if sep is None else Utf8(sep), maxsplit)]
|
|||
|
|
|||
|
def splitlines(self, keepends=False):
|
|||
|
return [str.__new__(Utf8, part) for part in str.splitlines(self, keepends)]
|
|||
|
|
|||
|
def startswith(self, prefix, start=0, end=None):
|
|||
|
unistr = unicode(self, 'utf-8')
|
|||
|
if isinstance(prefix, tuple):
|
|||
|
prefix = tuple(unicode(
|
|||
|
s, 'utf-8') if isinstance(s, str) else s for s in prefix)
|
|||
|
elif isinstance(prefix, str):
|
|||
|
prefix = unicode(prefix, 'utf-8')
|
|||
|
return unistr.startswith(prefix, start, len(unistr) if end is None else end)
|
|||
|
|
|||
|
def translate(self, table, deletechars=''):
|
|||
|
if isinstance(table, dict):
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8'))
|
|||
|
else:
|
|||
|
return str.__new__(Utf8, str.translate(self, table, deletechars))
|
|||
|
|
|||
|
def endswith(self, prefix, start=0, end=None):
|
|||
|
unistr = unicode(self, 'utf-8')
|
|||
|
if isinstance(prefix, tuple):
|
|||
|
prefix = tuple(unicode(
|
|||
|
s, 'utf-8') if isinstance(s, str) else s for s in prefix)
|
|||
|
elif isinstance(prefix, str):
|
|||
|
prefix = unicode(prefix, 'utf-8')
|
|||
|
return unistr.endswith(prefix, start, len(unistr) if end is None else end)
|
|||
|
if hasattr(str, 'format'): # Python 2.5 hasn't got str.format() method
|
|||
|
def format(self, *args, **kwargs):
|
|||
|
args = [unicode(
|
|||
|
s, 'utf-8') if isinstance(s, str) else s for s in args]
|
|||
|
kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
|
|||
|
unicode(v, 'utf-8') if isinstance(v, str) else v)
|
|||
|
for k, v in iteritems(kwargs))
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').format(*args, **kwargs).encode('utf-8'))
|
|||
|
|
|||
|
def __mod__(self, right):
|
|||
|
if isinstance(right, tuple):
|
|||
|
right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v
|
|||
|
for v in right)
|
|||
|
elif isinstance(right, dict):
|
|||
|
right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
|
|||
|
unicode(v, 'utf-8') if isinstance(v, str) else v)
|
|||
|
for k, v in iteritems(right))
|
|||
|
elif isinstance(right, str):
|
|||
|
right = unicode(right, 'utf-8')
|
|||
|
return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))
|
|||
|
|
|||
|
def __ge__(self, string):
|
|||
|
return sort_key(self) >= sort_key(string)
|
|||
|
|
|||
|
def __gt__(self, string):
|
|||
|
return sort_key(self) > sort_key(string)
|
|||
|
|
|||
|
def __le__(self, string):
|
|||
|
return sort_key(self) <= sort_key(string)
|
|||
|
|
|||
|
def __lt__(self, string):
|
|||
|
return sort_key(self) < sort_key(string)
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
|
|||
|
def doctests():
|
|||
|
u"""
|
|||
|
doctests:
|
|||
|
>>> test_unicode=u'ПРоба Є PRobe'
|
|||
|
>>> test_unicode_word=u'ПРоба'
|
|||
|
>>> test_number_str='12345'
|
|||
|
>>> test_unicode
|
|||
|
u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
|
|||
|
>>> print test_unicode
|
|||
|
ПРоба Є PRobe
|
|||
|
>>> test_word=test_unicode_word.encode('utf-8')
|
|||
|
>>> test_str=test_unicode.encode('utf-8')
|
|||
|
>>> s=Utf8(test_str)
|
|||
|
>>> s
|
|||
|
'ПРоба Є PRobe'
|
|||
|
>>> type(s)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s == test_str
|
|||
|
True
|
|||
|
>>> len(test_str) # wrong length of utf8-string!
|
|||
|
19
|
|||
|
>>> len(test_unicode) # RIGHT!
|
|||
|
13
|
|||
|
>>> len(s) # RIGHT!
|
|||
|
13
|
|||
|
>>> size(test_str) # size of utf-8 string (in bytes) == len(str)
|
|||
|
19
|
|||
|
>>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
|
|||
|
19
|
|||
|
>>> size(s) # size of utf-8 string in bytes
|
|||
|
19
|
|||
|
>>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
|
|||
|
... __builtin__.ord('б') # ascii string
|
|||
|
... except Exception, e:
|
|||
|
... print 'Exception:', e
|
|||
|
Exception: ord() expected a character, but string of length 2 found
|
|||
|
>>> ord('б') # utf8.ord() is used(!!!)
|
|||
|
1073
|
|||
|
>>> ord(u'б') # utf8.ord() is used(!!!)
|
|||
|
1073
|
|||
|
>>> ord(s[3]) # utf8.ord() is used(!!!)
|
|||
|
1073
|
|||
|
>>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
|
|||
|
'б'
|
|||
|
>>> type(chr(1073)) # utf8.chr() is used(!!!)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s=Utf8(test_unicode)
|
|||
|
>>> s
|
|||
|
'ПРоба Є PRobe'
|
|||
|
>>> s == test_str
|
|||
|
True
|
|||
|
>>> test_str == s
|
|||
|
True
|
|||
|
>>> s == test_unicode
|
|||
|
True
|
|||
|
>>> test_unicode == s
|
|||
|
True
|
|||
|
>>> print test_str.upper() # only ASCII characters uppered
|
|||
|
ПРоба Є PROBE
|
|||
|
>>> print test_unicode.upper() # unicode gives right result
|
|||
|
ПРОБА Є PROBE
|
|||
|
>>> s.upper() # utf8 class use unicode.upper()
|
|||
|
'ПРОБА Є PROBE'
|
|||
|
>>> type(s.upper())
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s.lower()
|
|||
|
'проба є probe'
|
|||
|
>>> type(s.lower())
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s.capitalize()
|
|||
|
'Проба є probe'
|
|||
|
>>> type(s.capitalize())
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> len(s)
|
|||
|
13
|
|||
|
>>> len(test_unicode)
|
|||
|
13
|
|||
|
>>> s+'. Probe is проба'
|
|||
|
'ПРоба Є PRobe. Probe is проба'
|
|||
|
>>> type(s+'. Probe is проба')
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s+u'. Probe is проба'
|
|||
|
'ПРоба Є PRobe. Probe is проба'
|
|||
|
>>> type(s+u'. Probe is проба')
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s+s
|
|||
|
'ПРоба Є PRobeПРоба Є PRobe'
|
|||
|
>>> type(s+s)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> a=s
|
|||
|
>>> a+=s
|
|||
|
>>> a+=test_unicode
|
|||
|
>>> a+=test_str
|
|||
|
>>> a
|
|||
|
'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
|
|||
|
>>> type(a)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s*3
|
|||
|
'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
|
|||
|
>>> type(s*3)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> a=Utf8("-проба-")
|
|||
|
>>> a*=10
|
|||
|
>>> a
|
|||
|
'-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
|
|||
|
>>> type(a)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
|
|||
|
'ПРоба Є PRobe'
|
|||
|
>>> s.center(17) # RIGHT!
|
|||
|
' ПРоба Є PRobe '
|
|||
|
>>> type(s.center(17))
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
|
|||
|
False
|
|||
|
>>> Utf8(test_word+test_number_str).isalnum()
|
|||
|
True
|
|||
|
>>> s.isalnum()
|
|||
|
False
|
|||
|
>>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
|
|||
|
False
|
|||
|
>>> Utf8(test_word).isalpha() # RIGHT!
|
|||
|
True
|
|||
|
>>> s.lower().islower()
|
|||
|
True
|
|||
|
>>> s.upper().isupper()
|
|||
|
True
|
|||
|
>>> print test_str.zfill(17) # WRONG RESULT!
|
|||
|
ПРоба Є PRobe
|
|||
|
>>> s.zfill(17) # RIGHT!
|
|||
|
'0000ПРоба Є PRobe'
|
|||
|
>>> type(s.zfill(17))
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s.istitle()
|
|||
|
False
|
|||
|
>>> s.title().istitle()
|
|||
|
True
|
|||
|
>>> Utf8('1234').isdigit()
|
|||
|
True
|
|||
|
>>> Utf8(' \t').isspace()
|
|||
|
True
|
|||
|
>>> s.join('•|•')
|
|||
|
'•ПРоба Є PRobe|ПРоба Є PRobe•'
|
|||
|
>>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
|
|||
|
'(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
|
|||
|
>>> type(s)
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> s==test_str
|
|||
|
True
|
|||
|
>>> s==test_unicode
|
|||
|
True
|
|||
|
>>> s.swapcase()
|
|||
|
'прОБА є prOBE'
|
|||
|
>>> type(s.swapcase())
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> truncate(s, 10)
|
|||
|
'ПРоба Є...'
|
|||
|
>>> truncate(s, 20)
|
|||
|
'ПРоба Є PRobe'
|
|||
|
>>> truncate(s, 10, '•••') # utf-8 string as *dots*
|
|||
|
'ПРоба Є•••'
|
|||
|
>>> truncate(s, 10, u'®') # you can use unicode string as *dots*
|
|||
|
'ПРоба Є P®'
|
|||
|
>>> type(truncate(s, 10))
|
|||
|
<class '__main__.Utf8'>
|
|||
|
>>> Utf8(s.encode('koi8-u'), 'koi8-u')
|
|||
|
'ПРоба Є PRobe'
|
|||
|
>>> s.decode() # convert utf-8 string to unicode
|
|||
|
u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
|
|||
|
>>> a='про\\tba'
|
|||
|
>>> str_tmp=a.expandtabs()
|
|||
|
>>> utf8_tmp=Utf8(a).expandtabs()
|
|||
|
>>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
|
|||
|
'про.....ba'
|
|||
|
>>> utf8_tmp.index('b')
|
|||
|
8
|
|||
|
>>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
|
|||
|
'про..ba'
|
|||
|
>>> str_tmp.index('b') # WRONG index of 'b' character
|
|||
|
8
|
|||
|
>>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
|
|||
|
'про..ba'
|
|||
|
>>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
|
|||
|
'про.ba'
|
|||
|
>>> s.find('Є')
|
|||
|
6
|
|||
|
>>> s.find(u'Є')
|
|||
|
6
|
|||
|
>>> s.find(' ', 6)
|
|||
|
7
|
|||
|
>>> s.rfind(' ')
|
|||
|
7
|
|||
|
>>> s.partition('Є')
|
|||
|
('ПРоба ', 'Є', ' PRobe')
|
|||
|
>>> s.partition(u'Є')
|
|||
|
('ПРоба ', 'Є', ' PRobe')
|
|||
|
>>> (a,b,c) = s.partition('Є')
|
|||
|
>>> type(a), type(b), type(c)
|
|||
|
(<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
|
|||
|
>>> s.partition(' ')
|
|||
|
('ПРоба', ' ', 'Є PRobe')
|
|||
|
>>> s.rpartition(' ')
|
|||
|
('ПРоба Є', ' ', 'PRobe')
|
|||
|
>>> s.index('Є')
|
|||
|
6
|
|||
|
>>> s.rindex(u'Є')
|
|||
|
6
|
|||
|
>>> s.index(' ')
|
|||
|
5
|
|||
|
>>> s.rindex(' ')
|
|||
|
7
|
|||
|
>>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
|
|||
|
>>> a.split()
|
|||
|
['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
|
|||
|
>>> a.rsplit()
|
|||
|
['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
|
|||
|
>>> a.expandtabs().split('б')
|
|||
|
['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
|
|||
|
>>> a.expandtabs().rsplit('б')
|
|||
|
['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
|
|||
|
>>> a.expandtabs().split(u'б', 1)
|
|||
|
['а ', ' ц д е а б ц д е а б ц д е']
|
|||
|
>>> a.expandtabs().rsplit(u'б', 1)
|
|||
|
['а б ц д е а б ц д е а ', ' ц д е']
|
|||
|
>>> a=Utf8("рядок1\\nрядок2\\nрядок3")
|
|||
|
>>> a.splitlines()
|
|||
|
['рядок1', 'рядок2', 'рядок3']
|
|||
|
>>> a.splitlines(True)
|
|||
|
['рядок1\\n', 'рядок2\\n', 'рядок3']
|
|||
|
>>> s[6]
|
|||
|
'Є'
|
|||
|
>>> s[0]
|
|||
|
'П'
|
|||
|
>>> s[-1]
|
|||
|
'e'
|
|||
|
>>> s[:10]
|
|||
|
'ПРоба Є PR'
|
|||
|
>>> s[2:-2:2]
|
|||
|
'оаЄPo'
|
|||
|
>>> s[::-1]
|
|||
|
'eboRP Є абоРП'
|
|||
|
>>> s.startswith('ПР')
|
|||
|
True
|
|||
|
>>> s.startswith(('ПР', u'об'),0)
|
|||
|
True
|
|||
|
>>> s.startswith(u'об', 2, 4)
|
|||
|
True
|
|||
|
>>> s.endswith('be')
|
|||
|
True
|
|||
|
>>> s.endswith(('be', 'PR', u'Є'))
|
|||
|
True
|
|||
|
>>> s.endswith('PR', 8, 10)
|
|||
|
True
|
|||
|
>>> s.endswith('Є', -7, -6)
|
|||
|
True
|
|||
|
>>> s.count(' ')
|
|||
|
2
|
|||
|
>>> s.count(' ',6)
|
|||
|
1
|
|||
|
>>> s.count(u'Є')
|
|||
|
1
|
|||
|
>>> s.count('Є', 0, 5)
|
|||
|
0
|
|||
|
>>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
|
|||
|
... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" }
|
|||
|
"Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
|
|||
|
>>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
|
|||
|
>>> a%=(s, s[::-1], 1000)
|
|||
|
>>> a
|
|||
|
'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
|
|||
|
>>> if hasattr(Utf8, 'format'):
|
|||
|
... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
|
|||
|
... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
|
|||
|
... else: # format() method is not used in python with version <2.6:
|
|||
|
... print True
|
|||
|
True
|
|||
|
>>> u'Б'<u'Ї' # WRONG ORDER!
|
|||
|
False
|
|||
|
>>> 'Б'<'Ї' # WRONG ORDER!
|
|||
|
False
|
|||
|
>>> Utf8('Б')<'Ї' # RIGHT!
|
|||
|
True
|
|||
|
>>> u'д'>u'ґ' # WRONG ORDER!
|
|||
|
False
|
|||
|
>>> Utf8('д')>Utf8('ґ') # RIGHT!
|
|||
|
True
|
|||
|
>>> u'є'<=u'ж' # WRONG ORDER!
|
|||
|
False
|
|||
|
>>> Utf8('є')<=u'ж' # RIGHT!
|
|||
|
True
|
|||
|
>>> Utf8('є')<=u'є'
|
|||
|
True
|
|||
|
>>> u'Ї'>=u'И' # WRONG ORDER!
|
|||
|
False
|
|||
|
>>> Utf8(u'Ї') >= u'И' # RIGHT
|
|||
|
True
|
|||
|
>>> Utf8('Є') >= 'Є'
|
|||
|
True
|
|||
|
>>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type
|
|||
|
>>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
|
|||
|
>>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
|
|||
|
>>> result = "".join(sorted(a))
|
|||
|
>>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
|
|||
|
'\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
|
|||
|
>>> try:
|
|||
|
... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
|
|||
|
... except Exception, e:
|
|||
|
... print 'Exception:', e
|
|||
|
Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
|
|||
|
>>> try: # FAILED! (working with bytes, not with utf8-charactes)
|
|||
|
... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
|
|||
|
... except Exception, e:
|
|||
|
... print 'Exception:', e
|
|||
|
Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
|
|||
|
>>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
|
|||
|
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
|
|||
|
>>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
|
|||
|
ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
|
|||
|
>>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
|
|||
|
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
|
|||
|
>>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
|
|||
|
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
|
|||
|
>>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
|
|||
|
аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
|
|||
|
>>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
|
|||
|
'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
|
|||
|
>>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
|
|||
|
... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
|
|||
|
... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
|
|||
|
... ], key=sort_key):
|
|||
|
... print result.ljust(20), type(result)
|
|||
|
абетка <type 'str'>
|
|||
|
Астро <type 'str'>
|
|||
|
аякс <type 'str'>
|
|||
|
білінг <type 'str'>
|
|||
|
веб <type 'str'>
|
|||
|
гала <type 'unicode'>
|
|||
|
ґанок <type 'str'>
|
|||
|
Гоша <class '__main__.Utf8'>
|
|||
|
Дар'я <class '__main__.Utf8'>
|
|||
|
Єва <type 'str'>
|
|||
|
Жужа <type 'unicode'>
|
|||
|
Іа <type 'str'>
|
|||
|
Їжа <type 'str'>
|
|||
|
Київ <type 'str'>
|
|||
|
лимонад <type 'str'>
|
|||
|
ложка <type 'str'>
|
|||
|
Матриця <type 'str'>
|
|||
|
проба <type 'str'>
|
|||
|
тест <type 'unicode'>
|
|||
|
шовк <type 'str'>
|
|||
|
Юляся <type 'str'>
|
|||
|
яблуко <type 'str'>
|
|||
|
|
|||
|
>>> a=Utf8("中文字")
|
|||
|
>>> L=list(a)
|
|||
|
>>> L
|
|||
|
['中', '文', '字']
|
|||
|
>>> a="".join(L)
|
|||
|
>>> print a
|
|||
|
中文字
|
|||
|
>>> type(a)
|
|||
|
<type 'str'>
|
|||
|
>>> a="中文字" # standard str type
|
|||
|
>>> L=list(a)
|
|||
|
>>> L
|
|||
|
['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
|
|||
|
>>> from string import maketrans
|
|||
|
>>> str_tab=maketrans('PRobe','12345')
|
|||
|
>>> unicode_tab={ord(u'П'):ord(u'Ж'),
|
|||
|
... ord(u'Р') : u'Ш',
|
|||
|
... ord(Utf8('о')) : None, # utf8.ord() is used
|
|||
|
... ord('б') : None, # -//-//-
|
|||
|
... ord(u'а') : u"中文字",
|
|||
|
... ord(u'Є') : Utf8('•').decode(), # only unicode type is supported
|
|||
|
... }
|
|||
|
>>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
|
|||
|
'ЖШ中文字•12345'
|
|||
|
"""
|
|||
|
import sys
|
|||
|
reload(sys)
|
|||
|
sys.setdefaultencoding("UTF-8")
|
|||
|
import doctest
|
|||
|
print("DOCTESTS STARTED...")
|
|||
|
doctest.testmod()
|
|||
|
print("DOCTESTS FINISHED")
|
|||
|
|
|||
|
doctests()
|