"""Create web2py model (python code) to represent MS SQL Server tables.
|
|
Features:
|
|
* Uses ANSI Standard INFORMATION_SCHEMA (might work with other RDBMS)
|
|
* Detects legacy "keyed" tables (not having an "id" PK)
|
|
* Handles 'funny' column names. web2py requires all column names be valid python identifiers. This script uses rname
|
|
* for column names that have spaces or are otherwise invalid python identifiers.
|
|
* Connects directly to running databases, no need to do a SQL dump
|
|
* Handles notnull, unique and referential constraints
|
|
* Detects most common datatypes and default values
|
|
* Supports running from the command line as well as from an IDE's debug menu. See the COMMAND_LINE_MODE constant below
|
|
* for more info.
|
|
|
|
Requirements:
|
|
* Needs pyodbc python connector
|
|
|
|
Created by Kyle Flanagan. Based on a script by Mariano Reingart which was
|
|
based on a script to "generate schemas from dbs" (mysql) by Alexandre Andrade
|
|
"""
|
|
|
|
__author__ = "Kyle Flanagan <kyleflanagan@gmail.com>"

HELP = """
|
|
USAGE: extract_mssql_models db host port user passwd
|
|
Call with SQL Server database connection parameters,
|
|
web2py model will be printed on standard output.
|
|
EXAMPLE: python extract_mssql_models.py mydb localhost 3306 kflanaga pass
|
|
or
|
|
python extract_mssql_models.py mydb localhost 3306 kflanaga pass > db_model.py
|
|
"""
|
|
|
|
# Config options
DEBUG = False  # print debug messages to STDERR
SCHEMA = 'dbo'
COMMAND_LINE_MODE = True  # running from command prompt. Disable to specify variables and use in IDE
# Only specify values below if not running from the command line
DB = None
HOST = None
USER = None
PASSWD = None
PORT = None

# Constant for Field keyword parameter order (and filter):
KWARGS = ('type', 'length', 'default', 'required', 'ondelete',
          'notnull', 'unique', 'label', 'comment', 'rname')

import sys
import re

# This is from pydal/helpers/regex.py as of 2016-06-16.
# Use this to recognize whether a field name needs an rname representation.
REGEX_VALID_TB_FLD = re.compile(r'^[^\d_][_0-9a-zA-Z]*\Z')
# For replacing invalid characters in field names
INVALID_CHARS = re.compile(r'[^a-zA-Z0-9_]')


def get_valid_column_name(field):
    """Return a valid column name that follows Python's rules for identifiers,
    which is what web2py requires for column names. Replaces invalid characters
    with underscores and a leading digit with its English word."""
    if not REGEX_VALID_TB_FLD.match(field):
        # If the first character is a digit, replace it with its word counterpart
        if re.match(r'^[0-9]', field):
            numbers = ['Zero', 'One', 'Two', 'Three', 'Four',
                       'Five', 'Six', 'Seven', 'Eight', 'Nine']
            field = numbers[int(field[0])] + field[1:]

        field = INVALID_CHARS.sub('_', field)
    return field


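# Illustrative examples of how the helper above mangles names (the outputs shown
# are what the implementation as written produces):
#   get_valid_column_name('Order Date')    -> 'Order_Date'
#   get_valid_column_name('2nd Qtr Sales') -> 'Twond_Qtr_Sales'
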
def query(conn, sql, *args):
    "Execute a SQL query and return rows as a list of dicts"
    cur = conn.cursor()
    ret = []
    try:
        if DEBUG: print >> sys.stderr, "QUERY: ", sql % args
        cur.execute(sql % args)
        for row in cur:
            dic = {}
            for i, value in enumerate(row):
                field = cur.description[i][0]
                dic[field] = value
            if DEBUG: print >> sys.stderr, "RET: ", dic
            ret.append(dic)
        return ret
    finally:
        cur.close()


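# Usage sketch (illustrative, hypothetical query): something like
#   query(cnn, "SELECT name FROM sys.tables WHERE name = '%s'", 'Customer')
# returns a list of dicts keyed by column name, e.g. [{'name': 'Customer'}].
# Note that arguments are interpolated with the % operator rather than passed
# as ODBC parameters, so they should come from trusted input only.
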
def get_tables(conn, schema=SCHEMA):
    "List table names in a given schema"
    rows = query(conn, """SELECT table_name FROM information_schema.tables
        WHERE table_schema = '%s'
        ORDER BY table_name""", schema)
    return [row['table_name'] for row in rows]


def get_fields(conn, table):
    "Retrieve field list for a given table"
    if DEBUG: print >> sys.stderr, "Processing TABLE", table
    rows = query(conn, """
        SELECT column_name, data_type,
               is_nullable,
               character_maximum_length,
               numeric_precision, numeric_precision_radix, numeric_scale,
               column_default
        FROM information_schema.columns
        WHERE table_name='%s'
        ORDER BY ordinal_position""", table)
    return rows


def define_field(conn, table, field, pks):
    "Determine field type, default value, references, etc."
    f = {}
    ref = references(conn, table, field['column_name'])
    if ref:
        f.update(ref)
    elif field['column_default'] and \
            field['column_default'].startswith("nextval") and \
            field['column_name'] in pks:
        f['type'] = "'id'"
    elif field['data_type'].startswith('character'):
        f['type'] = "'string'"
        if field['character_maximum_length']:
            f['length'] = field['character_maximum_length']
    elif field['data_type'] in ('text', 'ntext'):
        f['type'] = "'text'"
    elif field['data_type'] in ('boolean', 'bit'):
        f['type'] = "'boolean'"
    elif field['data_type'] in ('tinyint', 'smallint', 'bigint', 'int'):
        f['type'] = "'integer'"
    elif field['data_type'] in ('real', 'float'):
        f['type'] = "'double'"
    elif field['data_type'] in ('datetime', 'datetime2', 'smalldatetime'):
        f['type'] = "'datetime'"
    elif field['data_type'] in ('timestamp',):
        f['type'] = "'datetime'"
        f['default'] = "request.now"
        f['update'] = "request.now"
    elif field['data_type'] in ('date',):
        f['type'] = "'date'"
    elif field['data_type'] in ('time',):
        f['type'] = "'time'"
    elif field['data_type'] in ('numeric', 'money', 'smallmoney', 'decimal'):
        # web2py/pyDAL expects precision and scale inside the type string
        # itself, e.g. 'decimal(10,2)', rather than as separate Field arguments
        f['type'] = "'decimal(%s,%s)'" % (field['numeric_precision'],
                                          field['numeric_scale'] or 0)
    elif field['data_type'] in ('binary', 'varbinary', 'image'):
        f['type'] = "'blob'"
    elif field['data_type'] in ('point', 'lseg', 'polygon', 'unknown', 'USER-DEFINED', 'sql_variant'):
        f['type'] = ""  # unsupported?
    elif field['data_type'] in ('varchar', 'char', 'nchar', 'nvarchar', 'uniqueidentifier'):
        f['type'] = "'string'"
    else:
        raise RuntimeError("Data Type not supported: %s " % str(field))

    try:
        if field['column_default']:
            if field['column_default'] == "now()":
                d = "request.now"
            elif field['column_default'] == "true":
                d = "True"
            elif field['column_default'] == "false":
                d = "False"
            else:
                d = repr(eval(field['column_default']))
            f['default'] = str(d)
    except (ValueError, SyntaxError):
        pass
    except Exception:
        raise RuntimeError("Default unsupported '%s'" % field['column_default'])

    # INFORMATION_SCHEMA reports is_nullable as the strings 'YES'/'NO'
    if field['is_nullable'] == 'NO':
        f['notnull'] = "True"

    # For field names that are not valid python identifiers, we need to add a
    # reference (rname) to their actual name in the back-end database
    if not REGEX_VALID_TB_FLD.match(field['column_name']):
        f['rname'] = "'[%s]'" % field['column_name']

    return f


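# Illustration only: for a hypothetical nullable NVARCHAR column named
# "Last Name", define_field() returns roughly
#     {'type': "'string'", 'rname': "'[Last Name]'"}
# which define_table() below renders as a Field(...) line.
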
def is_unique(conn, table, field):
    "Find unique columns (incomplete support)"
    rows = query(conn, """
        SELECT c.column_name
        FROM information_schema.table_constraints t
        INNER JOIN information_schema.constraint_column_usage c
            ON (t.CONSTRAINT_CATALOG = c.CONSTRAINT_CATALOG
                AND t.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                AND t.CONSTRAINT_SCHEMA = c.CONSTRAINT_SCHEMA
                AND t.TABLE_CATALOG = c.TABLE_CATALOG
                AND t.TABLE_NAME = c.TABLE_NAME
                AND t.TABLE_SCHEMA = c.TABLE_SCHEMA)
        WHERE t.table_name='%s'
            AND c.column_name='%s'
            AND t.constraint_type='UNIQUE'
        ;""", table, field['column_name'])
    return bool(rows)


def primarykeys(conn, table):
    "Find primary keys"
    rows = query(conn, """
        SELECT c.column_name
        FROM information_schema.table_constraints t
        INNER JOIN information_schema.constraint_column_usage c
            ON (t.CONSTRAINT_CATALOG = c.CONSTRAINT_CATALOG
                AND t.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                AND t.CONSTRAINT_SCHEMA = c.CONSTRAINT_SCHEMA
                AND t.TABLE_CATALOG = c.TABLE_CATALOG
                AND t.TABLE_NAME = c.TABLE_NAME
                AND t.TABLE_SCHEMA = c.TABLE_SCHEMA)
        WHERE t.table_name='%s'
            AND t.constraint_type='PRIMARY KEY'
        ;""", table)
    return [row['column_name'] for row in rows]


def references(conn, table, field):
    "Find a FK (fails if multiple)"
    rows1 = query(conn, """
        SELECT k.table_name, k.column_name, k.constraint_name,
               r.update_rule, r.delete_rule, k.ordinal_position
        FROM information_schema.key_column_usage k
        INNER JOIN information_schema.referential_constraints r
            ON (k.CONSTRAINT_CATALOG = r.CONSTRAINT_CATALOG
                AND k.CONSTRAINT_NAME = r.CONSTRAINT_NAME
                AND k.CONSTRAINT_SCHEMA = r.CONSTRAINT_SCHEMA)
        INNER JOIN information_schema.table_constraints t
            ON (r.CONSTRAINT_CATALOG = t.CONSTRAINT_CATALOG
                AND r.CONSTRAINT_NAME = t.CONSTRAINT_NAME
                AND r.CONSTRAINT_SCHEMA = t.CONSTRAINT_SCHEMA)
        WHERE k.table_name='%s'
            AND k.column_name='%s'
            AND t.constraint_type='FOREIGN KEY'
        ;""", table, field)
    if len(rows1) == 1:
        rows2 = query(conn, """
            SELECT table_name, column_name, *
            FROM information_schema.constraint_column_usage
            WHERE constraint_name='%s'
            """, rows1[0]['constraint_name'])
        row = None
        if len(rows2) > 1:
            row = rows2[int(rows1[0]['ordinal_position']) - 1]
            keyed = True
        if len(rows2) == 1:
            row = rows2[0]
            keyed = False
        if row:
            if keyed:  # THIS IS BAD, DON'T MIX "id" and primarykey!!!
                ref = {'type': "'reference %s.%s'" % (row['table_name'],
                                                      row['column_name'])}
            else:
                ref = {'type': "'reference %s'" % (row['table_name'],)}
            if rows1[0]['delete_rule'] != "NO ACTION":
                ref['ondelete'] = repr(rows1[0]['delete_rule'])
            return ref
        elif rows2:
            raise RuntimeError("Unsupported foreign key reference: %s" %
                               str(rows2))
    elif rows1:
        raise RuntimeError("Unsupported referential constraint: %s" %
                           str(rows1))


def define_table(conn, table):
    "Output single table definition"
    fields = get_fields(conn, table)
    pks = primarykeys(conn, table)
    print "db.define_table('%s'," % (table,)
    for field in fields:
        fname = field['column_name']
        fdef = define_field(conn, table, field, pks)
        if fname not in pks and is_unique(conn, table, field):
            fdef['unique'] = "True"
        if fdef['type'] == "'id'" and fname in pks:
            pks.pop(pks.index(fname))
        print "    Field('%s', %s)," % (get_valid_column_name(fname),
                                        ', '.join(["%s=%s" % (k, fdef[k]) for k in KWARGS
                                                   if k in fdef and fdef[k]]))
    if pks:
        print "    primarykey=[%s]," % ", ".join(["'%s'" % pk for pk in pks])
    print "    migrate=migrate)"
    print


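# Roughly what the generated model looks like for a hypothetical legacy table
# (illustrative output only; real lengths and constraints depend on the schema):
#
#   db.define_table('Customer',
#       Field('CustomerID', type='integer', notnull=True),
#       Field('Last_Name', type='string', rname='[Last Name]'),
#       primarykey=['CustomerID'],
#       migrate=migrate)
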
def define_db(conn, db, host, port, user, passwd):
    "Output database definition (model)"
    dal = 'db = DAL("mssql4://%s:%s@%s:%s/%s", pool_size=10, decode_credentials=True)'
    print dal % (
        user.replace('@', '%40').replace(':', '%3A'),
        passwd.replace('@', '%40').replace(':', '%3A'),
        host, port, db)
    print
    print "migrate = False"
    print
    for table in get_tables(conn):
        define_table(conn, table)


if __name__ == "__main__":
|
|
# Parse arguments from command line:
|
|
if len(sys.argv) < 6 and COMMAND_LINE_MODE:
|
|
print HELP
|
|
else:
|
|
# Parse arguments from command line:
|
|
if COMMAND_LINE_MODE:
|
|
db, host, port, user, passwd = sys.argv[1:6]
|
|
else:
|
|
db = DB
|
|
host = HOST
|
|
user = USER
|
|
passwd = PASSWD
|
|
port = PORT
|
|
|
|
# Make the database connection (change driver if required)
|
|
import pyodbc
|
|
# cnn = pyodbc.connect(database=db, host=host, port=port,
|
|
# user=user, password=passwd,
|
|
# )
|
|
        cnn = pyodbc.connect(
            r'DRIVER={{SQL Server Native Client 11.0}};SERVER={server};PORT={port};DATABASE={db};UID={user};PWD={passwd}'.format(
                server=host, port=port, db=db, user=user, passwd=passwd)
        )
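        # Note (assumption): if the Native Client 11.0 driver is not installed,
        # substituting a newer driver name in the DRIVER={...} clause above
        # (e.g. 'ODBC Driver 17 for SQL Server', or whatever pyodbc.drivers()
        # reports on this machine) usually works.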
        # Start model code generation:
        define_db(cnn, db, host, port, user, passwd)