#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
| This file is part of the web2py Web Framework
| Created by niphlod@gmail.com
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
Scheduler with redis backend
---------------------------------
"""
from __future__ import print_function
import os
import time
import socket
import datetime
import logging
from json import loads, dumps
from gluon.utils import web2py_uuid
from gluon.storage import Storage
from gluon.scheduler import *
from gluon.scheduler import _decode_dict
from gluon.contrib.redis_utils import RWatchError
USAGE = """
## Example
For any existing app
Create File: app/models/scheduler.py ======
from gluon.contrib.redis_utils import RConn
from gluon.contrib.redis_scheduler import RScheduler
def demo1(*args, **vars):
    print('you passed args=%s and vars=%s' % (args, vars))
    return 'done!'

def demo2():
    1/0

rconn = RConn()
mysched = RScheduler(db, dict(demo1=demo1, demo2=demo2), ...., redis_conn=rconn)
## run worker nodes with:
cd web2py
python web2py.py -K app
"""
path = os.getcwd()
if 'WEB2PY_PATH' not in os.environ:
os.environ['WEB2PY_PATH'] = path
IDENTIFIER = "%s#%s" % (socket.gethostname(), os.getpid())
logger = logging.getLogger('web2py.scheduler.%s' % IDENTIFIER)
POLLING = 'POLLING'
class RScheduler(Scheduler):
def __init__(self, db, tasks=None, migrate=True,
worker_name=None, group_names=None, heartbeat=HEARTBEAT,
max_empty_runs=0, discard_results=False, utc_time=False,
redis_conn=None, mode=1):
"""
        Highly-experimental coordination with redis.
        Takes all args from Scheduler except redis_conn, which
        must be something close to a StrictRedis instance.
        My only regret - and the reason why I kept this under the hood for a
        while - is that it's hard to hook into something happening in web2py
        right after the commit to a table, which would enable this version of
        the scheduler to process "immediate" tasks right away instead of
        waiting a few seconds (see FIXME in queue_task()).
        mode is reserved for future usage patterns.
        Right now it moves worker coordination (the most IPC-intensive
        routine in the scheduler) to redis.
        I'd like to add incrementally redis-backed modes of operation,
        e.g.:
        - 1: IPC through redis (the current implementation)
        - 2: Store task results in redis (which will relieve further pressure
             from the db, leaving the scheduler_run table empty; task results
             can be set to expire after a while, keeping things smooth)
        - 3: Move all the logic for storing and queueing tasks to redis
             itself - which means no scheduler_task usage either - and use
             the database only for historical record-keeping
             (e.g. for reporting)
        As usual, I'm eager to see your comments.
"""
Scheduler.__init__(self, db, tasks=tasks, migrate=migrate,
worker_name=worker_name, group_names=group_names,
heartbeat=heartbeat, max_empty_runs=max_empty_runs,
discard_results=discard_results, utc_time=utc_time)
self.r_server = redis_conn
from gluon import current
self._application = current.request.application or 'appname'
def _nkey(self, key):
"""Helper to restrict all keys to a namespace and track them."""
prefix = 'w2p:rsched:%s' % self._application
allkeys = '%s:allkeys' % prefix
newkey = "%s:%s" % (prefix, key)
self.r_server.sadd(allkeys, newkey)
return newkey
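    # Illustrative note (added, with a made-up app name): for an app called
    # 'myapp', _nkey('queued:main') returns 'w2p:rsched:myapp:queued:main'
    # and registers that key in the 'w2p:rsched:myapp:allkeys' set, which is
    # what prune_all() walks when it deletes everything.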
def prune_all(self):
"""Global housekeeping."""
all_keys = self._nkey('allkeys')
with self.r_server.pipeline() as pipe:
while True:
try:
pipe.watch('PRUNE_ALL')
while True:
k = pipe.spop(all_keys)
if k is None:
break
pipe.delete(k)
pipe.execute()
break
except RWatchError:
time.sleep(0.1)
continue
def dt2str(self, value):
return value.strftime('%Y-%m-%d %H:%M:%S')
def str2date(self, value):
return datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
def send_heartbeat(self, counter):
"""
        Worker coordination in redis.
        It has evolved into something that is not that easy.
        Here we try to do what we need in a single transaction,
        and retry that transaction if something goes wrong.
"""
with self.r_server.pipeline() as pipe:
while True:
try:
pipe.watch('SEND_HEARTBEAT')
self.inner_send_heartbeat(counter, pipe)
pipe.execute()
self.adj_hibernation()
self.sleep()
break
except RWatchError:
time.sleep(0.1)
continue
def inner_send_heartbeat(self, counter, pipe):
"""
Do a few things in the "maintenance" thread.
Specifically:
- registers the workers
- accepts commands sent to workers (KILL, TERMINATE, PICK, DISABLED, etc)
- adjusts sleep
- saves stats
- elects master
- does "housecleaning" for dead workers
- triggers tasks assignment
"""
r_server = pipe
status_keyset = self._nkey('worker_statuses')
status_key = self._nkey('worker_status:%s' % (self.worker_name))
now = self.now()
mybackedstatus = r_server.hgetall(status_key)
if not mybackedstatus:
r_server.hmset(
status_key,
dict(
status=ACTIVE, worker_name=self.worker_name,
first_heartbeat=self.dt2str(now),
last_heartbeat=self.dt2str(now),
group_names=dumps(self.group_names), is_ticker=False,
worker_stats=dumps(self.w_stats))
)
r_server.sadd(status_keyset, status_key)
if not self.w_stats.status == POLLING:
self.w_stats.status = ACTIVE
self.w_stats.sleep = self.heartbeat
mybackedstatus = ACTIVE
else:
mybackedstatus = mybackedstatus['status']
if mybackedstatus == DISABLED:
# keep sleeping
self.w_stats.status = DISABLED
r_server.hmset(
status_key,
dict(last_heartbeat=self.dt2str(now),
worker_stats=dumps(self.w_stats))
)
elif mybackedstatus == TERMINATE:
self.w_stats.status = TERMINATE
logger.debug("Waiting to terminate the current task")
self.give_up()
elif mybackedstatus == KILL:
self.w_stats.status = KILL
self.die()
else:
if mybackedstatus == STOP_TASK:
logger.info('Asked to kill the current task')
self.terminate_process()
logger.info('........recording heartbeat (%s)',
self.w_stats.status)
r_server.hmset(
status_key,
dict(
last_heartbeat=self.dt2str(now), status=ACTIVE,
worker_stats=dumps(self.w_stats)
)
)
# newroutine
r_server.expire(status_key, self.heartbeat * 3 * 15)
self.w_stats.sleep = self.heartbeat # re-activating the process
if self.w_stats.status not in (RUNNING, POLLING):
self.w_stats.status = ACTIVE
self.do_assign_tasks = False
if counter % 5 == 0 or mybackedstatus == PICK:
try:
logger.info(
' freeing workers that have not sent heartbeat')
registered_workers = r_server.smembers(status_keyset)
allkeys = self._nkey('allkeys')
for worker in registered_workers:
w = r_server.hgetall(worker)
w = Storage(w)
if not w:
r_server.srem(status_keyset, worker)
logger.info('removing %s from %s', worker, allkeys)
r_server.srem(allkeys, worker)
continue
try:
self.is_a_ticker = self.being_a_ticker(pipe)
except:
pass
if self.w_stats.status in (ACTIVE, POLLING):
self.do_assign_tasks = True
if self.is_a_ticker and self.do_assign_tasks:
# I'm a ticker, and 5 loops passed without reassigning tasks,
# let's do that and loop again
if not self.db_thread:
logger.debug('thread building own DAL object')
self.db_thread = DAL(
self.db._uri, folder=self.db._adapter.folder)
self.define_tables(self.db_thread, migrate=False)
db = self.db_thread
self.wrapped_assign_tasks(db)
return None
except:
logger.error('Error assigning tasks')
def being_a_ticker(self, pipe):
"""
Elects a ticker.
        This is slightly more convoluted than the original
        but is far more efficient.
"""
r_server = pipe
status_keyset = self._nkey('worker_statuses')
registered_workers = r_server.smembers(status_keyset)
ticker = None
all_active = []
all_workers = []
for worker in registered_workers:
w = r_server.hgetall(worker)
if w['worker_name'] != self.worker_name and w['status'] == ACTIVE:
all_active.append(w)
if w['is_ticker'] == 'True' and ticker is None:
ticker = w
all_workers.append(w)
not_busy = self.w_stats.status in (ACTIVE, POLLING)
if not ticker:
if not_busy:
# only if this worker isn't busy, otherwise wait for a free one
for worker in all_workers:
key = self._nkey('worker_status:%s' % worker['worker_name'])
if worker['worker_name'] == self.worker_name:
r_server.hset(key, 'is_ticker', True)
else:
r_server.hset(key, 'is_ticker', False)
logger.info("TICKER: I'm a ticker")
else:
# giving up, only if I'm not alone
if len(all_active) > 1:
key = self._nkey('worker_status:%s' % (self.worker_name))
r_server.hset(key, 'is_ticker', False)
else:
not_busy = True
return not_busy
else:
logger.info(
"%s is a ticker, I'm a poor worker" % ticker['worker_name'])
return False
def assign_tasks(self, db):
"""
        The real beauty.
        We don't need to ASSIGN tasks; we just put
        them into the relevant queue.
"""
st, sd = db.scheduler_task, db.scheduler_task_deps
r_server = self.r_server
now = self.now()
status_keyset = self._nkey('worker_statuses')
with r_server.pipeline() as pipe:
while True:
try:
# making sure we're the only one doing the job
pipe.watch('ASSIGN_TASKS')
registered_workers = pipe.smembers(status_keyset)
all_workers = []
for worker in registered_workers:
w = pipe.hgetall(worker)
if w['status'] == ACTIVE:
all_workers.append(Storage(w))
pipe.execute()
break
except RWatchError:
time.sleep(0.1)
continue
# build workers as dict of groups
wkgroups = {}
for w in all_workers:
group_names = loads(w.group_names)
for gname in group_names:
if gname not in wkgroups:
wkgroups[gname] = dict(
workers=[{'name': w.worker_name, 'c': 0}])
else:
wkgroups[gname]['workers'].append(
{'name': w.worker_name, 'c': 0})
        # expire queued tasks whose stop_time passed between "runs" (i.e.,
        # while the scheduler was turned off): they weren't expired then,
        # but they are now
db(
(st.status.belongs((QUEUED, ASSIGNED))) &
(st.stop_time < now)
).update(status=EXPIRED)
# calculate dependencies
deps_with_no_deps = db(
(sd.can_visit == False) &
(~sd.task_child.belongs(
db(sd.can_visit == False)._select(sd.task_parent)
)
)
)._select(sd.task_child)
no_deps = db(
(st.status.belongs((QUEUED, ASSIGNED))) &
(
(sd.id == None) | (st.id.belongs(deps_with_no_deps))
)
)._select(st.id, distinct=True, left=sd.on(
(st.id == sd.task_parent) &
(sd.can_visit == False)
)
)
all_available = db(
(st.status.belongs((QUEUED, ASSIGNED))) &
(st.next_run_time <= now) &
(st.enabled == True) &
(st.id.belongs(no_deps))
)
        limit = len(all_workers) * (50 // (len(wkgroups) or 1))
# let's freeze it up
db.commit()
x = 0
r_server = self.r_server
for group in wkgroups.keys():
queued_list = self._nkey('queued:%s' % group)
queued_set = self._nkey('queued_set:%s' % group)
            # if they are already running, don't assign them again
running_list = self._nkey('running:%s' % group)
while True:
                # the joys of rpoplpush!
t = r_server.rpoplpush(running_list, queued_list)
if not t:
# no more
break
r_server.sadd(queued_set, t)
            tasks = all_available(st.group_name == group).select(
                limitby=(0, limit), orderby=st.next_run_time)
# put tasks in the processing list
for task in tasks:
x += 1
gname = task.group_name
if r_server.sismember(queued_set, task.id):
# already queued, we don't put on the list
continue
r_server.sadd(queued_set, task.id)
r_server.lpush(queued_list, task.id)
d = dict(status=QUEUED)
if not task.task_name:
d['task_name'] = task.function_name
db(
(st.id == task.id) &
(st.status.belongs((QUEUED, ASSIGNED)))
).update(**d)
db.commit()
# I didn't report tasks but I'm working nonetheless!!!!
if x > 0:
self.w_stats.empty_runs = 0
self.w_stats.queue = x
self.w_stats.distribution = wkgroups
self.w_stats.workers = len(all_workers)
# I'll be greedy only if tasks queued are equal to the limit
# (meaning there could be others ready to be queued)
self.greedy = x >= limit
logger.info('TICKER: workers are %s', len(all_workers))
logger.info('TICKER: tasks are %s', x)
def pop_task(self, db):
"""Lift a task off a queue."""
r_server = self.r_server
st = self.db.scheduler_task
task = None
# ready to process something
for group in self.group_names:
queued_set = self._nkey('queued_set:%s' % group)
queued_list = self._nkey('queued:%s' % group)
running_list = self._nkey('running:%s' % group)
running_dict = self._nkey('running_dict:%s' % group)
self.w_stats.status = POLLING
            # polling lasts 1 minute in total: with multiple groups the
            # timeout is split evenly among them, so a full pass over all
            # groups still takes at most 1 minute
logger.debug(' polling on %s', group)
task_id = r_server.brpoplpush(queued_list, running_list,
timeout=60 / len(self.group_names))
logger.debug(' finished polling')
self.w_stats.status = ACTIVE
if task_id:
r_server.hset(running_dict, task_id, self.worker_name)
r_server.srem(queued_set, task_id)
task = db(
(st.id == task_id) &
(st.status == QUEUED)
).select().first()
if not task:
r_server.lrem(running_list, 0, task_id)
r_server.hdel(running_dict, task_id)
r_server.lrem(queued_list, 0, task_id)
logger.error("we received a task that isn't there (%s)",
task_id)
return None
break
now = self.now()
if task:
task.update_record(status=RUNNING, last_run_time=now)
            # no one will touch my task!
db.commit()
logger.debug(' work to do %s', task.id)
else:
logger.info('nothing to do')
return None
times_run = task.times_run + 1
if task.cronline:
cron_recur = CronParser(task.cronline, now.replace(second=0))
next_run_time = cron_recur.get_next()
elif not task.prevent_drift:
next_run_time = task.last_run_time + datetime.timedelta(
seconds=task.period
)
else:
# calc next_run_time based on available slots
# see #1191
next_run_time = task.start_time
secondspassed = self.total_seconds(now - next_run_time)
steps = secondspassed // task.period + 1
next_run_time += datetime.timedelta(seconds=task.period * steps)
if times_run < task.repeats or task.repeats == 0:
# need to run (repeating task)
run_again = True
else:
# no need to run again
run_again = False
run_id = 0
        while not self.discard_results:
logger.debug(' new scheduler_run record')
try:
run_id = db.scheduler_run.insert(
task_id=task.id,
status=RUNNING,
start_time=now,
worker_name=self.worker_name)
db.commit()
break
except:
time.sleep(0.5)
db.rollback()
logger.info('new task %(id)s "%(task_name)s"'
' %(application_name)s.%(function_name)s' % task)
return Task(
app=task.application_name,
function=task.function_name,
timeout=task.timeout,
args=task.args, # in json
vars=task.vars, # in json
task_id=task.id,
run_id=run_id,
run_again=run_again,
next_run_time=next_run_time,
times_run=times_run,
stop_time=task.stop_time,
retry_failed=task.retry_failed,
times_failed=task.times_failed,
sync_output=task.sync_output,
uuid=task.uuid,
group_name=task.group_name)
def report_task(self, task, task_report):
"""
Override.
        Needed only because we have to pop the finished task
        off the running list (and dict) in redis.
"""
r_server = self.r_server
db = self.db
now = self.now()
st = db.scheduler_task
sr = db.scheduler_run
if not self.discard_results:
if task_report.result != 'null' or task_report.tb:
# result is 'null' as a string if task completed
# if it's stopped it's None as NoneType, so we record
# the STOPPED "run" anyway
logger.debug(' recording task report in db (%s)',
task_report.status)
db(sr.id == task.run_id).update(
status=task_report.status,
stop_time=now,
run_result=task_report.result,
run_output=task_report.output,
traceback=task_report.tb)
else:
logger.debug(' deleting task report in db because of no result')
db(sr.id == task.run_id).delete()
# if there is a stop_time and the following run would exceed it
is_expired = (task.stop_time and
task.next_run_time > task.stop_time and
True or False)
status = (task.run_again and is_expired and EXPIRED or
task.run_again and not is_expired and
QUEUED or COMPLETED)
if task_report.status == COMPLETED:
# assigned calculations
d = dict(status=status,
next_run_time=task.next_run_time,
times_run=task.times_run,
times_failed=0,
assigned_worker_name=self.worker_name
)
db(st.id == task.task_id).update(**d)
if status == COMPLETED:
self.update_dependencies(db, task.task_id)
else:
st_mapping = {'FAILED': 'FAILED',
'TIMEOUT': 'TIMEOUT',
'STOPPED': 'FAILED'}[task_report.status]
status = (task.retry_failed and
task.times_failed < task.retry_failed and
QUEUED or task.retry_failed == -1 and
QUEUED or st_mapping)
db(st.id == task.task_id).update(
times_failed=st.times_failed + 1,
next_run_time=task.next_run_time,
status=status,
assigned_worker_name=self.worker_name
)
logger.info('task completed (%s)', task_report.status)
running_list = self._nkey('running:%s' % task.group_name)
running_dict = self._nkey('running_dict:%s' % task.group_name)
r_server.lrem(running_list, 0, task.task_id)
r_server.hdel(running_dict, task.task_id)
def wrapped_pop_task(self):
"""Commodity function to call `pop_task` and trap exceptions.
If an exception is raised, assume it happened because of database
contention and retries `pop_task` after 0.5 seconds
"""
db = self.db
db.commit() # another nifty db.commit() only for Mysql
x = 0
while x < 10:
try:
rtn = self.pop_task(db)
return rtn
            # this is here to "interrupt" any brpoplpush op easily
except KeyboardInterrupt:
self.give_up()
break
except:
self.w_stats.errors += 1
db.rollback()
logger.error(' error popping tasks')
x += 1
time.sleep(0.5)
def get_workers(self, only_ticker=False):
"""Return a dict holding worker_name : {**columns}
representing all "registered" workers.
only_ticker returns only the worker running as a TICKER,
if there is any
"""
r_server = self.r_server
status_keyset = self._nkey('worker_statuses')
registered_workers = r_server.smembers(status_keyset)
all_workers = {}
for worker in registered_workers:
w = r_server.hgetall(worker)
w = Storage(w)
if not w:
continue
all_workers[w.worker_name] = Storage(
status=w.status,
first_heartbeat=self.str2date(w.first_heartbeat),
last_heartbeat=self.str2date(w.last_heartbeat),
group_names=loads(w.group_names, object_hook=_decode_dict),
is_ticker=w.is_ticker == 'True' and True or False,
worker_stats=loads(w.worker_stats, object_hook=_decode_dict)
)
if only_ticker:
            for k, v in all_workers.items():
if v['is_ticker']:
return {k: v}
return {}
return all_workers
def set_worker_status(self, group_names=None, action=ACTIVE,
exclude=None, limit=None, worker_name=None):
"""Internal function to set worker's status"""
r_server = self.r_server
all_workers = self.get_workers()
if not group_names:
group_names = self.group_names
elif isinstance(group_names, str):
group_names = [group_names]
        exclusion = (exclude or []) + [action]
workers = []
if worker_name is not None:
if worker_name in all_workers.keys():
workers = [worker_name]
else:
            for k, v in all_workers.items():
if v.status not in exclusion and set(group_names) & set(v.group_names):
workers.append(k)
if limit and worker_name is None:
workers = workers[:limit]
if workers:
with r_server.pipeline() as pipe:
while True:
try:
pipe.watch('SET_WORKER_STATUS')
for w in workers:
worker_key = self._nkey('worker_status:%s' % w)
pipe.hset(worker_key, 'status', action)
pipe.execute()
break
except RWatchError:
time.sleep(0.1)
continue
def queue_task(self, function, pargs=[], pvars={}, **kwargs):
"""
FIXME: immediate should put item in queue. The hard part is
that currently there are no hooks happening at post-commit time
Queue tasks. This takes care of handling the validation of all
parameters
Args:
function: the function (anything callable with a __name__)
pargs: "raw" args to be passed to the function. Automatically
jsonified.
pvars: "raw" kwargs to be passed to the function. Automatically
jsonified
kwargs: all the parameters available (basically, every
`scheduler_task` column). If args and vars are here, they should
be jsonified already, and they will override pargs and pvars
Returns:
a dict just as a normal validate_and_insert(), plus a uuid key
holding the uuid of the queued task. If validation is not passed
( i.e. some parameters are invalid) both id and uuid will be None,
and you'll get an "error" dict holding the errors found.
"""
if hasattr(function, '__name__'):
function = function.__name__
targs = 'args' in kwargs and kwargs.pop('args') or dumps(pargs)
tvars = 'vars' in kwargs and kwargs.pop('vars') or dumps(pvars)
tuuid = 'uuid' in kwargs and kwargs.pop('uuid') or web2py_uuid()
tname = 'task_name' in kwargs and kwargs.pop('task_name') or function
immediate = 'immediate' in kwargs and kwargs.pop('immediate') or None
cronline = kwargs.get('cronline')
kwargs.update(function_name=function,
task_name=tname,
args=targs,
vars=tvars,
uuid=tuuid)
if cronline:
try:
                start_time = kwargs.get('start_time', self.now())
next_run_time = CronParser(cronline, start_time).get_next()
kwargs.update(start_time=start_time, next_run_time=next_run_time)
except:
pass
rtn = self.db.scheduler_task.validate_and_insert(**kwargs)
if not rtn.errors:
rtn.uuid = tuuid
if immediate:
r_server = self.r_server
ticker = self.get_workers(only_ticker=True)
                if ticker:
                    ticker = list(ticker)[0]
with r_server.pipeline() as pipe:
while True:
try:
pipe.watch('SET_WORKER_STATUS')
worker_key = self._nkey('worker_status:%s' % ticker)
pipe.hset(worker_key, 'status', 'PICK')
pipe.execute()
break
except RWatchError:
time.sleep(0.1)
continue
else:
rtn.uuid = None
return rtn
def stop_task(self, ref):
"""Shortcut for task termination.
        If the task is RUNNING it will be terminated, meaning that its
        status will be set to FAILED.
        If the task is QUEUED, its stop_time will be set to "now",
        the enabled flag will be set to False, and the status to STOPPED.
Args:
ref: can be
- an integer : lookup will be done by scheduler_task.id
- a string : lookup will be done by scheduler_task.uuid
Returns:
- 1 if task was stopped (meaning an update has been done)
- None if task was not found, or if task was not RUNNING or QUEUED
Note:
Experimental
"""
r_server = self.r_server
st = self.db.scheduler_task
if isinstance(ref, int):
q = st.id == ref
elif isinstance(ref, str):
q = st.uuid == ref
else:
raise SyntaxError(
"You can retrieve results only by id or uuid")
task = self.db(q).select(st.id, st.status, st.group_name)
task = task.first()
rtn = None
if not task:
return rtn
running_dict = self._nkey('running_dict:%s' % task.group_name)
if task.status == 'RUNNING':
worker_key = r_server.hget(running_dict, task.id)
worker_key = self._nkey('worker_status:%s' % (worker_key))
r_server.hset(worker_key, 'status', STOP_TASK)
elif task.status == 'QUEUED':
rtn = self.db(q).update(
stop_time=self.now(),
enabled=False,
status=STOPPED)
return rtn