SP/web2py/gluon/contrib/autolinks.py

"""
Developed by Massimo Di Pierro
Released under the web2py license (LGPL)

What does it do?

if html is a variable containing HTML text and urls in the text, when you call

    html = expand_html(html)

it automatically converts the urls to links, but when possible it embeds the object being linked.
In particular it can embed images, videos, audio files, documents (it uses the Google Docs viewer),
as well as pages served by an oEmbed service.


Google Doc Support
==================
Microsoft Word (.DOC, .DOCX)
Microsoft Excel (.XLS and .XLSX)
Microsoft PowerPoint 2007 / 2010 (.PPTX)
Apple Pages (.PAGES)
Adobe PDF (.PDF)
Adobe Illustrator (.AI)
Adobe Photoshop (.PSD)
Autodesk AutoCad (.DXF)
Scalable Vector Graphics (.SVG)
PostScript (.EPS, .PS)
TrueType (.TTF)
XML Paper Specification (.XPS)

Oembed Support
==============
flickr.com
youtube.com
hulu.com
vimeo.com
slideshare.net
qik.com
polleverywhere.com
wordpress.com
revision3.com
viddler.com
"""
from __future__ import print_function
from gluon._compat import FancyURLopener, urllib_quote

import re
import cgi
import sys
from json import loads
import urllib
import uuid
try:
    from BeautifulSoup import BeautifulSoup, Comment
    have_soup = True
except ImportError:
    have_soup = False

# Matches any http(s) URL: the scheme followed by a run of non-whitespace.
regex_link = re.compile(r'https?://\S+')

# (url pattern, oEmbed endpoint) pairs, tried in order by oembed().
# The patterns are raw strings so that regex escapes such as `\S` and `\w`
# are not parsed as (invalid) Python string escapes.
EMBED_MAPS = [
    (re.compile(r'http://\S*?flickr.com/\S*'),
     'http://www.flickr.com/services/oembed/'),
    (re.compile(r'http://\S*.youtu(\.be|be\.com)/watch\S*'),
     'http://www.youtube.com/oembed'),
    (re.compile(r'http://www.hulu.com/watch/\S*'),
     'http://www.hulu.com/api/oembed.json'),
    (re.compile(r'http://vimeo.com/\S*'),
     'http://vimeo.com/api/oembed.json'),
    (re.compile(r'http://www.slideshare.net/[^\/]+/\S*'),
     'http://www.slideshare.net/api/oembed/2'),
    (re.compile(r'http://qik.com/\S*'),
     'http://qik.com/api/oembed.json'),
    (re.compile(r'http://\w+/\S+'.replace(r'\w+/', r'www.polleverywhere.com/\w+/', 1)),
     'http://www.polleverywhere.com/services/oembed/'),
    (re.compile(r'http://\S+.wordpress.com/\S+'),
     'http://public-api.wordpress.com/oembed/'),
    # The original pattern was the glob 'http://*.revision3.com/...', which as
    # a regex never matched sub-domains such as www.revision3.com.
    (re.compile(r'http://\S*?revision3.com/\S+'),
     'http://revision3.com/api/oembed/'),
    (re.compile(r'http://\S+.viddler.com/\S+'),
     'http://lab.viddler.com/services/oembed/'),
]


def image(url):
    """Return an ``<img>`` tag showing ``url``, limited to the container width.

    Double quotes in ``url`` are replaced by ``&quot;`` so that the value
    cannot terminate the ``src`` attribute and inject further attributes.
    """
    return '<img src="%s" style="max-width:100%%"/>' % url.replace('"', '&quot;')


def audio(url):
    """Return an ``<audio>`` player whose single source is ``url``.

    Double quotes in ``url`` are replaced by ``&quot;`` so that the value
    cannot terminate the ``src`` attribute and inject further attributes.
    """
    return '<audio controls="controls" style="max-width:100%%"><source src="%s" /></audio>' % url.replace('"', '&quot;')


def video(url):
    """Return a ``<video>`` player whose single source is ``url``.

    Double quotes in ``url`` are replaced by ``&quot;`` so that the value
    cannot terminate the ``src`` attribute and inject further attributes.
    """
    return '<video controls="controls" style="max-width:100%%"><source src="%s" /></video>' % url.replace('"', '&quot;')


def googledoc_viewer(url):
    """Return an ``<iframe>`` that shows ``url`` through the Google Docs viewer.

    The document URL is percent-quoted before being placed in the viewer's
    ``url`` query parameter.
    """
    template = '<iframe src="https://docs.google.com/viewer?url=%s&embedded=true" style="max-width:100%%"></iframe>'
    quoted_url = urllib_quote(url)
    return template % quoted_url


def web2py_component(url):
    """Return markup that loads ``url`` into a placeholder via ``web2py_component``.

    A ``<div>`` with a freshly generated UUID as its id is emitted, followed
    by a script that calls the client-side ``web2py_component(url, id)``.

    ``url`` ends up inside a double-quoted JavaScript string literal, so
    backslashes and double quotes are backslash-escaped, and ``<`` is written
    as ``\\x3c`` so the value cannot close the surrounding ``<script>`` tag.
    """
    code = str(uuid.uuid4())
    # Backslash must be escaped first, otherwise the escapes added for the
    # other characters would themselves be doubled.
    js_url = url.replace('\\', '\\\\').replace('"', '\\"').replace('<', '\\x3c')
    return '<div id="%s"></div><script>\nweb2py_component("%s","%s");\n</script>' % (code, js_url, code)

# Maps a lower-case file extension (as returned by extension()) to the
# function that renders the embedding HTML for a URL of that type.
EXTENSION_MAPS = {
    # images
    'png': image,
    'gif': image,
    'jpg': image,
    'jpeg': image,
    # audio
    'wav': audio,
    'ogg': audio,
    'mp3': audio,
    # video
    'mov': video,
    'mpe': video,
    'mp4': video,
    'mpg': video,
    'mpg2': video,
    'mpeg': video,
    'mpeg4': video,
    'movie': video,
    'wmv': video,
    # web2py components
    'load': web2py_component,
    # documents rendered through the Google Docs viewer
    'pdf': googledoc_viewer,
    'doc': googledoc_viewer,
    'docx': googledoc_viewer,
    'ppt': googledoc_viewer,
    'pptx': googledoc_viewer,
    'xls': googledoc_viewer,
    'xlsx': googledoc_viewer,
    'pages': googledoc_viewer,
    'ai': googledoc_viewer,
    'psd': googledoc_viewer,
    # 'xdf' was a typo for AutoCAD's 'dxf'; it is kept so that existing
    # behavior for '.xdf' URLs does not change.
    'xdf': googledoc_viewer,
    'dxf': googledoc_viewer,
    'svg': googledoc_viewer,
    # PostScript formats listed in the module docstring
    'eps': googledoc_viewer,
    'ps': googledoc_viewer,
    'ttf': googledoc_viewer,
    'xps': googledoc_viewer,
}


class VimeoURLOpener(FancyURLopener):
    """URL opener that announces a browser-like user agent.

    Vimeo blocks the urllib user agent for some reason, so the ``version``
    attribute is overridden with a Mozilla string.
    """
    version = "Mozilla/4.0"
# Module-level side effect: install the opener on the urllib module.
# NOTE(review): this relies on the private ``urllib._urlopener`` hook, which
# Python 2's ``urllib.urlopen`` consults; confirm the behavior on Python 3.
urllib._urlopener = VimeoURLOpener()


def oembed(url):
    """Fetch oEmbed metadata for ``url`` from the first provider that answers.

    Every ``(pattern, endpoint)`` pair in ``EMBED_MAPS`` whose pattern matches
    ``url`` is queried in order with ``format=json``.

    Returns the decoded JSON response of the first successful request, or an
    empty dict when no pattern matches or every request fails. Failures are
    deliberately swallowed: embedding is best-effort and the caller falls
    back to a plain link.
    """
    for pattern, endpoint in EMBED_MAPS:
        if not pattern.match(url):
            continue
        try:
            # The target URL is a query-string value, so it must be
            # percent-encoded (HTML escaping, as done previously, corrupts
            # URLs containing '&'). Text is encoded to UTF-8 first because
            # Python 2's quote() cannot handle non-ASCII unicode.
            raw = url if isinstance(url, bytes) else url.encode('utf8')
            request_url = endpoint + '?format=json&url=' + urllib_quote(raw, safe='')
            response = urllib.urlopen(request_url)
            try:
                data = response.read()
            finally:
                response.close()
            return loads(data)  # json!
        except Exception:
            # Network or decoding problem: try the next matching provider.
            continue
    return {}


def extension(url):
    """Return the lower-cased file extension of ``url``, or ``''`` if none.

    Only the last path segment is inspected: the query string, the fragment,
    the scheme and the host are ignored, so a bare ``http://example.ai`` has
    no extension and dots in directory names do not leak into the result.
    A string without a scheme (e.g. ``'photo.JPG'``) is treated as a path.
    """
    # Drop the query string and the fragment.
    path = url.split('?', 1)[0].split('#', 1)[0]
    scheme_end = path.find('://')
    if scheme_end != -1:
        # Strip "scheme://host" so the host's TLD is never taken for an
        # extension; what remains is the path after the first '/'.
        path = path[scheme_end + 3:].partition('/')[2]
    filename = path.rsplit('/', 1)[-1]
    if '.' not in filename:
        return ''
    return filename.rsplit('.', 1)[-1].lower()


def expand_one(url, cdict):
    """Return the HTML that should replace a single ``url`` found in text.

    Resolution order:
      1. an e-mail address (contains ``@`` but no ``://``) becomes a
         ``mailto:`` link;
      2. an oEmbed response carrying ``html`` is returned as is (wrapped in
         ``<embed>`` when it starts with ``<object``);
      3. an oEmbed response carrying ``url`` replaces the url being rendered;
      4. a known file extension is rendered by its ``EXTENSION_MAPS`` handler;
      5. anything else becomes a plain ``<a>`` link.

    ``cdict`` is an optional dict used as a cache of oEmbed responses keyed
    by url; it is updated in place. Pass ``None`` to disable caching.
    """
    if '@' in url and '://' not in url:
        return '<a href="mailto:%s">%s</a>' % (url.replace('"', '&quot;'), url)
    # try oembed but first check in cache
    if cdict and url in cdict:
        r = cdict[url]
    else:
        r = oembed(url)
        if isinstance(cdict, dict):
            cdict[url] = r
    # if oembed service
    if 'html' in r:
        html = r['html']
        # On Python 2 this turns unicode into a UTF-8 byte string as before;
        # on Python 3 text is already ``str`` and must stay text, otherwise
        # the startswith() check below fails with bytes.
        if not isinstance(html, str):
            html = html.encode('utf8')
        if html.startswith('<object'):
            return '<embed style="max-width:100%%">%s</embed>' % html
        return html
    elif 'url' in r:
        url = r['url']
        if not isinstance(url, str):
            url = url.encode('utf8')
    # embed images, video, audio files
    ext = extension(url)
    if ext in EXTENSION_MAPS:
        return EXTENSION_MAPS[ext](url)
    # else regular link; quotes are escaped so the url cannot close the
    # href attribute
    return '<a href="%s">%s</a>' % (url.replace('"', '&quot;'), url)


def expand_html(html, cdict=None):
    """Return ``html`` with every bare http(s) URL in its text expanded.

    HTML comments are removed. Each remaining text node is scanned for URLs
    and every match is replaced by the markup returned by ``expand_one``.
    Text located anywhere inside ``a``, ``script``, ``pre``, ``code``,
    ``embed``, ``object``, ``audio`` or ``video`` elements is left alone.

    ``cdict`` is an optional dict passed through to ``expand_one`` as a cache
    of oEmbed responses.

    Raises RuntimeError when BeautifulSoup could not be imported.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")
    skipped_tags = ('a', 'script', 'pre', 'code', 'embed', 'object', 'audio', 'video')
    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for txt in soup.findAll(text=True):
        # Walk up all ancestors, not just the direct parent, so that a URL in
        # e.g. <a><b>http://...</b></a> is not turned into a nested link.
        ancestor = txt.parent
        while ancestor is not None and ancestor.name not in skipped_tags:
            ancestor = ancestor.parent
        if ancestor is not None:
            continue
        ntxt = regex_link.sub(
            lambda match: expand_one(match.group(0), cdict), txt)
        txt.replaceWith(BeautifulSoup(ntxt))
    return str(soup)


def test():
    """Expand a small sample document and return the resulting HTML.

    The sample contains one URL per supported kind: an oEmbed page, an
    image, a web2py component and a document.
    """
    sample_html = """
<h3>Fringilla nisi parturient nullam</h3>
<p>http://www.youtube.com/watch?v=IWBFiI5RrA0</p>
<p>http://www.web2py.com/examples/static/images/logo_bw.png</p>
<p>http://www.web2py.com/examples/default/index.load</p>
<p>http://www.web2py.com/examples/static/web2py_manual_cutl.pdf</p>
<p>Elementum sodales est varius magna leo sociis erat. Nascetur pretium non
ultricies gravida. Condimentum at nascetur tempus. Porttitor viverra ipsum
accumsan neque aliquet. Ultrices vestibulum tempor quisque eget sem eget.
Ornare malesuada tempus dolor dolor magna consectetur. Nisl dui non curabitur
laoreet tortor.</p>
"""
    expanded = expand_html(sample_html)
    return expanded

# Command-line entry point: expand the HTML file named by the first argument,
# or run the built-in sample when no argument is given.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Use a context manager so the input file is closed promptly.
        with open(sys.argv[1]) as source:
            print(expand_html(source.read()))
    else:
        print(test())
Add. 2018-10-25 15:33:07 +00:00			`"""`
			`Developed by Massimo Di Pierro`
			`Released under the web2py license (LGPL)`

			`What does it do?`

			`if html is a variable containing HTML text and urls in the text, when you call`

			`html = expend_html(html)`

			`it automatically converts the url to links but when possible it embeds the object being linked.`
			`In particular it can embed images, videos, audio files, documents (it uses the google code player),`
			`as well as pages to a oembed service.`


			`Google Doc Support`
			`==================`
			`Microsoft Word (.DOC, .DOCX)`
			`Microsoft Excel (.XLS and .XLSX)`
			`Microsoft PowerPoint 2007 / 2010 (.PPTX)`
			`Apple Pages (.PAGES)`
			`Adobe PDF (.PDF)`
			`Adobe Illustrator (.AI)`
			`Adobe Photoshop (.PSD)`
			`Autodesk AutoCad (.DXF)`
			`Scalable Vector Graphics (.SVG)`
			`PostScript (.EPS, .PS)`
			`TrueType (.TTF)`
			`XML Paper Specification (.XPS)`

			`Oembed Support`
			`==============`
			`flickr.com`
			`youtube.com`
			`hulu.com`
			`vimeo.com`
			`slideshare.net`
			`qik.com`
			`polleverywhere.com`
			`wordpress.com`
			`revision3.com`
			`viddler.com`
			`"""`
			`from __future__ import print_function`
			`from gluon._compat import FancyURLopener, urllib_quote`

			`import re`
			`import cgi`
			`import sys`
			`from json import loads`
			`import urllib`
			`import uuid`
			`try:`
			`from BeautifulSoup import BeautifulSoup, Comment`
			`have_soup = True`
			`except ImportError:`
			`have_soup = False`

			`regex_link = re.compile('https?://\S+')`

			`EMBED_MAPS = [`
			`(re.compile('http://\S?flickr.com/\S'),`
			`'http://www.flickr.com/services/oembed/'),`
			`(re.compile('http://\S.youtu(\.be\|be\.com)/watch\S'),`
			`'http://www.youtube.com/oembed'),`
			`(re.compile('http://www.hulu.com/watch/\S*'),`
			`'http://www.hulu.com/api/oembed.json'),`
			`(re.compile('http://vimeo.com/\S*'),`
			`'http://vimeo.com/api/oembed.json'),`
			`(re.compile('http://www.slideshare.net/[^\/]+/\S*'),`
			`'http://www.slideshare.net/api/oembed/2'),`
			`(re.compile('http://qik.com/\S*'),`
			`'http://qik.com/api/oembed.json'),`
			`(re.compile('http://www.polleverywhere.com/\w+/\S+'),`
			`'http://www.polleverywhere.com/services/oembed/'),`
			`(re.compile('http://\S+.wordpress.com/\S+'),`
			`'http://public-api.wordpress.com/oembed/'),`
			`(re.compile('http://*.revision3.com/\S+'),`
			`'http://revision3.com/api/oembed/'),`
			`(re.compile('http://\S+.viddler.com/\S+'),`
			`'http://lab.viddler.com/services/oembed/'),`
			`]`


			`def image(url):`
			`return '<img src="%s" style="max-width:100%%"/>' % url`


			`def audio(url):`
			`return '<audio controls="controls" style="max-width:100%%"><source src="%s" /></audio>' % url`


			`def video(url):`
			`return '<video controls="controls" style="max-width:100%%"><source src="%s" /></video>' % url`


			`def googledoc_viewer(url):`
			`return '<iframe src="https://docs.google.com/viewer?url=%s&embedded=true" style="max-width:100%%"></iframe>' % urllib_quote(url)`


			`def web2py_component(url):`
			`code = str(uuid.uuid4())`
			`return '<div id="%s"></div><script>\nweb2py_component("%s","%s");\n</script>' % (code, url, code)`

			`EXTENSION_MAPS = {`
			`'png': image,`
			`'gif': image,`
			`'jpg': image,`
			`'jpeg': image,`
			`'wav': audio,`
			`'ogg': audio,`
			`'mp3': audio,`
			`'mov': video,`
			`'mpe': video,`
			`'mp4': video,`
			`'mpg': video,`
			`'mpg2': video,`
			`'mpeg': video,`
			`'mpeg4': video,`
			`'movie': video,`
			`'wmv': video,`
			`'load': web2py_component,`
			`'pdf': googledoc_viewer,`
			`'doc': googledoc_viewer,`
			`'docx': googledoc_viewer,`
			`'ppt': googledoc_viewer,`
			`'pptx': googledoc_viewer,`
			`'xls': googledoc_viewer,`
			`'xlsx': googledoc_viewer,`
			`'pages': googledoc_viewer,`
			`'ai': googledoc_viewer,`
			`'psd': googledoc_viewer,`
			`'xdf': googledoc_viewer,`
			`'svg': googledoc_viewer,`
			`'ttf': googledoc_viewer,`
			`'xps': googledoc_viewer,`
			`}`


			`class VimeoURLOpener(FancyURLopener):`
			`"Vimeo blocks the urllib user agent for some reason"`
			`version = "Mozilla/4.0"`
			`urllib._urlopener = VimeoURLOpener()`


			`def oembed(url):`
			`for k, v in EMBED_MAPS:`
			`if k.match(url):`
			`oembed = v + '?format=json&url=' + cgi.escape(url)`
			`try:`
			`data = urllib.urlopen(oembed).read()`
			`return loads(data) # json!`
			`except:`
			`pass`
			`return {}`


			`def extension(url):`
			`return url.split('?')[0].split('.')[-1].lower()`


			`def expand_one(url, cdict):`
			`# try ombed but first check in cache`
			`if '@' in url and not '://'in url:`
			`return '<a href="mailto:%s">%s</a>' % (url, url)`
			`if cdict and url in cdict:`
			`r = cdict[url]`
			`else:`
			`r = oembed(url)`
			`if isinstance(cdict, dict):`
			`cdict[url] = r`
			`# if oembed service`
			`if 'html' in r:`
			`html = r['html'].encode('utf8')`
			`if html.startswith('<object'):`
			`return '<embed style="max-width:100%%">%s</embed>' % html`
			`else:`
			`return html`
			`elif 'url' in r:`
			`url = r['url'].encode('utf8')`
			`# embed images, video, audio files`
			`ext = extension(url)`
			`if ext in EXTENSION_MAPS:`
			`return EXTENSION_MAPS[ext](url)`
			`# else regular link`
			`return '<a href="%(u)s">%(u)s</a>' % dict(u=url)`


			`def expand_html(html, cdict=None):`
			`if not have_soup:`
			`raise RuntimeError("Missing BeautifulSoup")`
			`soup = BeautifulSoup(html)`
			`comments = soup.findAll(text=lambda text: isinstance(text, Comment))`
			`[comment.extract() for comment in comments]`
			`for txt in soup.findAll(text=True):`
			`if not txt.parent.name in ('a', 'script', 'pre', 'code', 'embed', 'object', 'audio', 'video'):`
			`ntxt = regex_link.sub(`
			`lambda match: expand_one(match.group(0), cdict), txt)`
			`txt.replaceWith(BeautifulSoup(ntxt))`
			`return str(soup)`


			`def test():`
			`example = """`
			`<h3>Fringilla nisi parturient nullam</h3>`
			`<p>http://www.youtube.com/watch?v=IWBFiI5RrA0</p>`
			`<p>http://www.web2py.com/examples/static/images/logo_bw.png</p>`
			`<p>http://www.web2py.com/examples/default/index.load</p>`
			`<p>http://www.web2py.com/examples/static/web2py_manual_cutl.pdf</p>`
			`<p>Elementum sodales est varius magna leo sociis erat. Nascetur pretium non`
			`ultricies gravida. Condimentum at nascetur tempus. Porttitor viverra ipsum`
			`accumsan neque aliquet. Ultrices vestibulum tempor quisque eget sem eget.`
			`Ornare malesuada tempus dolor dolor magna consectetur. Nisl dui non curabitur`
			`laoreet tortor.</p>`
			`"""`
			`return expand_html(example)`

			`if __name__ == "__main__":`
			`if len(sys.argv) > 1:`
			`print(expand_html(open(sys.argv[1]).read()))`
			`else:`
			`print(test())`