author    Glenn Elliott <gelliott@cs.unc.edu>  2014-09-21 18:35:53 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>  2014-09-21 18:35:53 -0400
commit    0d0de82d3994f3e737925e1e6a3e4d403375e529 (patch)
tree      f6d883c29fc61bf089a7232328841fc990306317
parent    d3efc4e3241fdc1d2ec8524a424a29895dc86a22 (diff)
Move to MySQL
-rw-r--r--  rtss14/createtables.sql |  50
-rwxr-xr-x  rtss14/database.py      | 172
-rwxr-xr-x  rtss14/rtss14.py        | 104
3 files changed, 190 insertions(+), 136 deletions(-)
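
Note: this commit moves the experiment database layer from sqlite3 to MySQL (via the
MySQLdb module). The mechanical changes visible in the diffs below are: '%s' parameter
placeholders instead of sqlite's '?', dictionary cursors instead of sqlite3.Row, explicit
START TRANSACTION / commit / autocommit handling instead of BEGIN EXCLUSIVE TRANSACTION,
AUTO_INCREMENT ids and ENGINE=InnoDB in the schema, and SELECT ... FOR UPDATE row locking
when workers claim pending design points. A minimal sketch of the new access pattern,
assuming MySQL-python is installed (host, credentials, and the inserted value are
placeholders, not the ones used by database.py):

    import MySQLdb
    import MySQLdb.cursors

    # Hypothetical connection parameters -- substitute a real host/user/password/db.
    conn = MySQLdb.connect('db.example.org', 'user', 'secret', 'rtss14')
    c = conn.cursor(MySQLdb.cursors.DictCursor)   # rows come back as dicts

    # MySQLdb uses '%s' placeholders where sqlite3 used '?'.
    c.execute('SELECT id, name FROM distrs WHERE name=%s', ('u-uni-light',))
    row = c.fetchone()                             # e.g. {'id': 1L, 'name': 'u-uni-light'}

    # Explicit transaction in place of sqlite's BEGIN EXCLUSIVE TRANSACTION.
    conn.autocommit(False)
    c.execute('START TRANSACTION')
    c.execute('INSERT INTO distrs(name) VALUES(%s)', ('u-uni-example',))
    conn.commit()
    conn.autocommit(True)
    c.close()
    conn.close()
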
diff --git a/rtss14/createtables.sql b/rtss14/createtables.sql
index 45490d3..a072aa4 100644
--- a/rtss14/createtables.sql
+++ b/rtss14/createtables.sql
@@ -1,7 +1,8 @@
 CREATE TABLE distrs(
-    id INTEGER PRIMARY KEY NOT NULL,
-    name TEXT
-);
+    id INTEGER NOT NULL AUTO_INCREMENT,
+    name TEXT,
+    PRIMARY KEY (id)
+) ENGINE=InnoDB;
 -- util
 INSERT INTO distrs(name) VALUES('u-uni-light');
 INSERT INTO distrs(name) VALUES('u-uni-medium');
@@ -37,12 +38,9 @@ INSERT INTO distrs(name) VALUES('c-const-light');
 INSERT INTO distrs(name) VALUES('c-const-medium');
 INSERT INTO distrs(name) VALUES('c-const-heavy');
 
---CREATE TABLE dummy(
---    pid INTEGER
---);
---INSERT INTO dummy(pid) VALUES(0);
-
 CREATE TABLE dp_pending(
+    id INTEGER NOT NULL AUTO_INCREMENT,
+
     -- task set util cap
     ts_util REAL,
 
@@ -51,6 +49,7 @@ CREATE TABLE dp_pending(
     cpu_cluster_size INTEGER,
     ngpu INTEGER,
     gpu_cluster_size INTEGER,
+    is_release_master INTEGER,
 
     -- overheads config
     is_worst_case INTEGER,
@@ -77,25 +76,27 @@ CREATE TABLE dp_pending(
     ncopy_engines INTEGER,
     chunk_size INTEGER,
 
+    PRIMARY KEY (id)
     -- every permutation marks a unique configuration
-    PRIMARY KEY(ts_util,
-        ncpu, cpu_cluster_size, ngpu, gpu_cluster_size,
-        is_worst_case, is_polluters, wss_size,
-        util_dist, period_dist, data_dist, state_dist, kernel_dist, cpu_dist,
-        gpu_population,
-        rho, is_dgl, is_p2p, ncopy_engines, chunk_size)
-);
+    -- PRIMARY KEY(ts_util,
+    --     ncpu, cpu_cluster_size, ngpu, gpu_cluster_size, is_release_master,
+    --     is_worst_case, is_polluters, wss_size,
+    --     util_dist, period_dist, data_dist, state_dist, kernel_dist, cpu_dist,
+    --     gpu_population,
+    --     rho, is_dgl, is_p2p, ncopy_engines, chunk_size)
+) ENGINE=InnoDB;
 
 -- partially tested dps
 CREATE TABLE dp_ptested(
     -- auto-computed unique id for this
-    id INTEGER PRIMARY KEY NOT NULL,
+    id INTEGER NOT NULL AUTO_INCREMENT,
 
     -- platform parameters
     ncpu INTEGER,
     cpu_cluster_size INTEGER,
     ngpu INTEGER,
     gpu_cluster_size INTEGER,
+    is_release_master INTEGER,
 
     -- overheads config
     is_worst_case INTEGER,
@@ -122,13 +123,14 @@ CREATE TABLE dp_ptested(
     ncopy_engines INTEGER,
     chunk_size INTEGER,
 
+    PRIMARY KEY (id)
     -- every permutation marks a unique configuration
-    UNIQUE(ncpu, cpu_cluster_size, ngpu, gpu_cluster_size,
-        is_worst_case, is_polluters, wss_size,
-        util_dist, period_dist, data_dist, state_dist, kernel_dist, cpu_dist,
-        gpu_population,
-        rho, is_dgl, is_p2p, ncopy_engines, chunk_size)
-);
+    -- UNIQUE(ncpu, cpu_cluster_size, ngpu, gpu_cluster_size, is_release_master,
+    --     is_worst_case, is_polluters, wss_size,
+    --     util_dist, period_dist, data_dist, state_dist, kernel_dist, cpu_dist,
+    --     gpu_population,
+    --     rho, is_dgl, is_p2p, ncopy_engines, chunk_size)
+) ENGINE=InnoDB;
 
 CREATE TABLE sched_results(
     dp INTEGER,
@@ -145,7 +147,7 @@ CREATE TABLE sched_results(
 
     FOREIGN KEY(dp) REFERENCES dp_ptested(id),
     PRIMARY KEY(dp, ts_util)
-);
+) ENGINE=InnoDB;
 
 CREATE TABLE scaled_sched_results(
     dp INTEGER,
@@ -161,4 +163,4 @@ CREATE TABLE scaled_sched_results(
 
     FOREIGN KEY(dp) REFERENCES dp_ptested(id),
     PRIMARY KEY(dp, eff_ts_util, scale_factor)
-);
+) ENGINE=InnoDB;
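
Schema note: sqlite accepted "id INTEGER PRIMARY KEY NOT NULL" and ignored table engines,
whereas the MySQL version above needs AUTO_INCREMENT, a separate PRIMARY KEY (id) clause,
and ENGINE=InnoDB so that FOREIGN KEY constraints and row-level locking actually apply. A
small sketch of creating and exercising one such table from Python (the table name
"example" is hypothetical; database.py itself re-queries for the assigned id rather than
using cursor.lastrowid):

    import MySQLdb

    def create_example_table(conn):
        c = conn.cursor()
        # MySQL-style auto-increment key; InnoDB is needed for FOREIGN KEY support.
        c.execute('CREATE TABLE IF NOT EXISTS example('
                  '  id INTEGER NOT NULL AUTO_INCREMENT,'
                  '  name TEXT,'
                  '  PRIMARY KEY (id)'
                  ') ENGINE=InnoDB')
        c.execute('INSERT INTO example(name) VALUES(%s)', ('demo',))
        print c.lastrowid   # id assigned by AUTO_INCREMENT
        conn.commit()
        c.close()
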
diff --git a/rtss14/database.py b/rtss14/database.py
index af6e966..cd5535d 100755
--- a/rtss14/database.py
+++ b/rtss14/database.py
@@ -5,8 +5,7 @@ import time
 import random
 import copy
 import itertools
-import sqlite3 as lite
-import json
+import MySQLdb as db
 
 from schedcat.util.storage import storage
 from generator import DesignPointGenerator
@@ -17,15 +16,13 @@ timeout = 10*60 # 10 minute timeout on database locks
 max_fail = 30 # maximum number of times to reset a bad db connection
 distr_mapper = None # maps database distribution ids to strings
 dp_col_names = None # list of keys to extract from sched results to write to db
+dp_col_type_strs = None
 ######################
 
 def backoff(t):
     time_to_sleep = random.random() * t
     time.sleep(time_to_sleep)
 
-def init_db(db_name):
-    lite.register_converter("JSON", json.loads)
-
 def init_distr_mapper(conn):
     global distr_mapper
     global timeout
@@ -34,7 +31,7 @@ def init_distr_mapper(conn):
         return
 
     distr_mapper = {}
-    c = conn.cursor()
+    c = conn.cursor(db.cursors.DictCursor)
     start = time.time()
     while True:
         try:
@@ -44,17 +41,19 @@ def init_distr_mapper(conn):
                 distr_mapper[int(row['id'])] = str(row['name'])
                 distr_mapper[str(row['name'])] = int(row['id'])
             break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             elapsed = time.time() - start
             if elapsed > timeout:
                 raise
             else:
                 backoff(10)
                 pass
-
+    c.close()
 
 def init_dp_col_names(conn):
     global dp_col_names
+    global dp_col_type_strs
     global timeout
 
     if dp_col_names is not None and len(dp_col_names) > 0:
@@ -65,16 +64,27 @@ def init_dp_col_names(conn):
     while True:
         # can't use pragma within transaction safely. so loop.
         try:
-            c.execute('PRAGMA table_info(dp_pending)')
-            dp_col_names = list(map(lambda x: x[1].encode('ascii', 'ignore'), c.fetchall()))
+            c.execute('SHOW COLUMNS FROM dp_pending')
+            cols = c.fetchall()
+            dp_col_names = []
+            dp_col_type_strs = {}
+            for field in cols:
+                name = field[0].encode('ascii', 'ignore')
+                if(name == 'id'):
+                    continue
+                dp_col_names.append(name)
+                form = '%s'
+                dp_col_type_strs[name] = form
             break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             elapsed = time.time() - start
             if elapsed > timeout:
                 raise
             else:
                 backoff(10)
                 pass
+    c.close()
 
 def db_type(var):
     if type(var) is int or type(var) is bool:
@@ -108,6 +118,7 @@ def dp_to_db(dp):
     data['cpu_cluster_size'] = dp.ncpus / dp.nclusters
     data['ngpu'] = dp.ngpus
     data['gpu_cluster_size'] = dp.ngpus / dp.ngclusters
+    data['is_release_master'] = dp.release_master
     data['is_worst_case'] = 1 if dp.ovh_type == 'max' else 0
     data['is_polluters'] = dp.polluters
     data['wss_size'] = dp.wss
@@ -138,6 +149,7 @@ def db_to_dp(data):
     exp.nclusters = [exp.ncpus[0]/int(data['cpu_cluster_size'])]
     exp.ngpus = [int(data['ngpu'])]
     exp.ngclusters = [exp.ngpus[0]/int(data['gpu_cluster_size'])]
+    exp.release_master = [int(data['is_release_master'])]
     exp.polluters = [int(data['is_polluters']) != 0]
     exp.ovh_type = ['max' if int(data['is_worst_case']) == 1 else 'mean']
     exp.rho = [int(data['rho'])]
@@ -169,43 +181,44 @@ def dp_to_db_vals(dp):
         vals[key] = to_db_val(value)
     return vals
 
-def begin_sync(conn):
+def begin_sync(conn, c):
     # use a shorter timout because we'll retry with a new db connection instead
     transaction_timeout = 5*60
     start = time.time()
     while True:
         try:
-            conn.execute('BEGIN EXCLUSIVE TRANSACTION')
-            # Force a write lock to be acquired before we do
-            # anything more. I hate sqlite.
+            conn.autocommit(False)
+            c.execute('START TRANSACTION')
             break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             elapsed = time.time() - start
             if elapsed > transaction_timeout:
                 raise
             else:
                 backoff(20)
                 pass
-    return conn
+    return c
 
 def end_sync(conn):
     success = conn.commit()
+    conn.autocommit(True)
     return success
 
 
-def connect_db(db_name, isolation = 'EXCLUSIVE'):
+def connect_db(db_name):
     global distr_mapper
     global dp_col_names
+    global dp_col_type_strs
     global timeout
 
-    init_db(db_name)
-
     start = time.time()
     while True:
         try:
-            conn = lite.connect(db_name, detect_types=lite.PARSE_DECLTYPES|lite.PARSE_COLNAMES, isolation_level=isolation)
+            conn = db.connect('mydb.cs.unc.edu', 'gelliott', 'G1ennDB', db_name);
             break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             elapsed = time.time() - start
             # just give up :(
             if elapsed > timeout:
@@ -213,9 +226,6 @@ def connect_db(db_name, isolation = 'EXCLUSIVE'):
         else:
             backoff(10)
             pass
-    conn.row_factory = lite.Row
-    # enable foreign keys
-    conn.execute('PRAGMA FOREIGN_KEYS=ON')
 
     if distr_mapper is None:
         init_distr_mapper(conn)
@@ -232,17 +242,18 @@ def clear_tables(db_name):
     with conn:
         c = conn.cursor()
         ################
-        begin_sync(conn)
+        begin_sync(conn, c)
         for t in tables:
             c.execute('DELETE FROM %s' % t)
         end_sync(conn)
+        c.close()
         ################
 
 #def get_results(conn, dp, fields = '*', table = 'sched_results', extra = None):
 #    where = ['%s=?' % k for k in dp.iterkeys()]
 #
 #    ################
-#    c = conn.cursor()
+#    c = conn.cursor(db.cursors.DictCursor)
 #    begin_sync(conn)
 #    if extra:
 #        query = "SELECT %s FROM %s WHERE %s %s" % (", ".join(fields), table, " AND ".join(where), extra)
@@ -258,7 +269,7 @@ def clear_tables(db_name):
 #    return data
 
 #def count_results(conn, dp, table = 'sched_results'):
-#    c = conn.cursor()
+#    c = conn.cursor(db.cursors.DictCursor)
 #    where = ['%s=?' % k for k in dp.iterkeys()]
 #
 #    query = "SELECT COUNT(*) FROM %s WHERE %s" % (table, " AND ".join(where))
@@ -272,35 +283,36 @@ def clear_tables(db_name):
 #
 #    return data[0]
 
-def __get_dp_id(conn, dp, add_if_missing = False):
+def __get_dp_id(c, dp, add_if_missing = False):
     global dp_col_names
+    global dp_col_type_strs
     seeking = dp_to_db(dp)
     seeking.pop('ts_util', None)
-    query = 'SELECT id FROM dp_ptested WHERE %s' % ' AND '.join(map(lambda c: '%s=?' % c, seeking.iterkeys()))
-    c = conn.cursor()
+    query = 'SELECT id FROM dp_ptested WHERE %s' % ' AND '.join(map(lambda x: '%s=%s' % (x, dp_col_type_strs[x]), seeking.iterkeys()))
     c.execute(query, tuple(seeking.values()))
-    data = c.fetchone()
-    if data:
-        return data[0]
+    row = c.fetchone()
+    if row:
+        return row['id']
     elif add_if_missing == False:
         return None
     # add the design point
     col_names = copy.deepcopy(dp_col_names)
     col_names.pop(0) # remove the id field
-    c.execute('INSERT INTO dp_ptested VALUES(NULL,%s)' % ','.join(['?']*len(col_names)),
+    c.execute('INSERT INTO dp_ptested VALUES(NULL,%s)' % ','.join(['%s']*len(col_names)),
               tuple(map(lambda x: seeking[x], col_names)))
     # get the assigned id
     c.execute(query, tuple(seeking.values()))
-    data = c.fetchone()
-    return data[0]
+    row = c.fetchone()
+    return row['id']
 
 def __already_pending(dp, conn):
     seeking = dp_to_db(dp)
     c = conn.cursor()
-    c.execute('SELECT COUNT(*) FROM dp_pending WHERE %s' % ' AND '.join(map(lambda x: '%s=?' % x, seeking.iterkeys())),
+    c.execute('SELECT COUNT(*) FROM dp_pending WHERE %s' % ' AND '.join(map(lambda x: '%s=%s' % (x, dp_col_type_strs[x]), seeking.iterkeys())),
               tuple(seeking.values()))
-    data = c.fetchone()
-    processed = bool(data[0])
+    row = c.fetchone()
+    processed = bool(row[0])
+    c.close()
     return processed
 
 def already_pending(dp, conn = None, db_name = None):
@@ -316,10 +328,11 @@ def __already_processed(dp, conn):
     seeking.pop('ts_util', None)
     c = conn.cursor()
     c.execute('SELECT COUNT(*) FROM sched_results as R JOIN dp_ptested as K on R.dp=K.id '
-              'WHERE R.ts_util=? AND %s' % ' AND '.join(map(lambda x: 'K.%s=?' % x, seeking.iterkeys())),
+              'WHERE R.ts_util=%%s AND %s' % ' AND '.join(map(lambda x: 'K.%s=%s' % (x, dp_col_type_strs[x]), seeking.iterkeys())),
               ((dp.sys_util,) + tuple(seeking.values())))
-    data = c.fetchone()
-    processed = bool(data[0])
+    row = c.fetchone()
+    processed = bool(row[0])
+    c.close()
     return processed
 
 def already_processed(dp, conn = None, db_name = None):
@@ -332,6 +345,7 @@ def already_processed(dp, conn = None, db_name = None):
 
 def store_design_points(db_name, dps, clean):
     global dp_col_names
+    global dp_col_type_strs
 
     conn = connect_db(db_name)
     npending = 0
@@ -345,22 +359,21 @@ def store_design_points(db_name, dps, clean):
     dps = [dp_to_db(dp) for dp in dps]
 
     ################
-    begin_sync(conn)
-
     # convert dicts into correctly-ordered list and insert
-    c.executemany('INSERT INTO dp_pending VALUES(%s)' % ','.join(['?']*len(dp_col_names)),
+    c.executemany('INSERT INTO dp_pending VALUES(NULL,%s)' % ','.join(['%s']*len(dp_col_names)),
         [tuple(map(lambda x: d[x], dp_col_names)) for d in dps])
     # complete
     c.execute('SELECT COUNT(*) FROM dp_pending')
     npending = c.fetchone()[0]
-
-    end_sync(conn)
     ##############
+
+    c.close()
     return len(dps), npending
 
 
 def get_design_points(db_name, ndp = 1):
     global dp_col_names
+    global dp_col_type_strs
     global max_fail
     fetched = []
 
@@ -380,16 +393,17 @@ def get_design_points(db_name, ndp = 1):
             conn = connect_db(db_name)
             with conn:
                 ################
-                begin_sync(conn)
-                c = conn.cursor()
-                c.execute('SELECT %s FROM dp_pending LIMIT ?' % (','.join(dp_col_names)), (ndp,))
+                c = conn.cursor(db.cursors.DictCursor)
+                begin_sync(conn, c)
+#                c.execute('LOCK TABLES dp_pending WRITE')
+                c.execute('SELECT %s FROM dp_pending LIMIT %%s FOR UPDATE' % (','.join(dp_col_names)), (ndp,))
                 dps = [db_to_dp(d) for d in c.fetchall()]
                 nfetched = len(dps) if dps else 0
                 if nfetched > 0:
                     temp = [dp_to_db(d) for d in dps]
                     for i in xrange(nfetched):
                         c.execute('DELETE FROM dp_pending '
-                                  'WHERE %s' % ' AND '.join(map(lambda c: '%s=?'%c, dp_col_names)),
+                                  'WHERE %s' % ' AND '.join(map(lambda x: '%s=%s'%(x, dp_col_type_strs[x]), dp_col_names)),
                                   tuple(map(lambda x: temp[i][x], dp_col_names)))
                         if c.rowcount == 1:
                             fetched.append(dps[i])
@@ -399,11 +413,14 @@ def get_design_points(db_name, ndp = 1):
                             print 'deleted too many rows... dropping.'
                 if len(fetched) == 0:
                     retry = True
+#                c.execute('UNLOCK TABLES')
                 end_sync(conn)
+                c.close()
                 ##############
                 # success!
                 break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             # make sure the db connection is closed
             if conn is not None:
                 conn.close()
@@ -416,9 +433,7 @@ def get_design_points(db_name, ndp = 1):
             else:
                 time_to_sleep = (random.random()*20 + 5)*failcount
                 time_to_sleep = min(time_to_sleep, 4*60)
-                # flush lustre cache???
                 print 'retrying get_design_points from scratch. sleep = %.3f' % time_to_sleep
-                os.system('sync')
                 time.sleep(time_to_sleep)
 
     if failcount > 0:
@@ -430,19 +445,16 @@ def get_design_points(db_name, ndp = 1):
         del conn
         conn = None
 
-    os.system('sync')
-
     return fetched
 
 
-def __store_eff_sched_results(conn, dp_id, stats):
-    c = conn.cursor()
+def __store_eff_sched_results(c, dp_id, stats):
     for factor,eff_curve in stats.iteritems():
         for eff_ts_util,sched in eff_curve.iteritems():
             # do prior results exist?
             c.execute('SELECT ntested, nsched, avg_sched, avg_tard, avg_bandwidth '
                       'FROM scaled_sched_results '
-                      'WHERE dp=? AND eff_ts_util=? AND scale_factor=?',
+                      'WHERE dp=%s AND eff_ts_util=%s AND scale_factor=%s FOR UPDATE',
                       (dp_id, eff_ts_util, factor))
             row = c.fetchone()
             if row:
@@ -459,16 +471,17 @@ def __store_eff_sched_results(conn, dp_id, stats):
                 sched.ntested += rntested
                 sched.nsched += rnsched
                 c.execute('UPDATE scaled_sched_results '
-                          'SET ntested=?, nsched=?, avg_sched=?, avg_tard=?, avg_bandwidth=? '
-                          'WHERE dp=? AND eff_ts_util=? AND scale_factor=?',
+                          'SET ntested=%s, nsched=%s, avg_sched=%s, avg_tard=%s, avg_bandwidth=%s '
+                          'WHERE dp=%s AND eff_ts_util=%s AND scale_factor=%s',
                           (sched.ntested, sched.nsched, sched.avg_sched, sched.avg_tard, sched.avg_bandwidth, dp_id, eff_ts_util, factor))
             else:
                 c.execute('INSERT INTO scaled_sched_results '
-                          'VALUES(?,?,?,?,?,?,?,?)',
+                          'VALUES(%s,%s,%s,%s,%s,%s,%s,%s)',
                           (dp_id, eff_ts_util, factor, sched.ntested, sched.nsched, sched.avg_sched, sched.avg_tard, sched.avg_bandwidth))
 
 def store_sched_results(db_name, data, ndp = 0):
     global dp_col_names
+    global dp_col_type_strs
     global max_fail
     col_names = None
 
@@ -488,33 +501,32 @@ def store_sched_results(db_name, data, ndp = 0):
             col_names.pop(0) # remove the id field
 
             with conn:
-                # if the db hangs, it's always here on the begin transaction...
-                begin_sync(conn)
+                c = conn.cursor(db.cursors.DictCursor)
 
                 # get IDs for all design points
-                dp_ids = [__get_dp_id(conn, d.dp, add_if_missing = True) for d in data]
+                dp_ids = [__get_dp_id(c, d.dp, add_if_missing = True) for d in data]
                 # insert the normal sched data in one go
-                conn.executemany('INSERT INTO sched_results VALUES(?,?,?,?,?,?,?)',
+                c.executemany('INSERT INTO sched_results VALUES(%s,%s,%s,%s,%s,%s,%s)',
                     [(dp_id, ts_util, stats.avg_sched, stats.ntested, stats.nsched, stats.avg_tard, stats.avg_bandwidth)
                      for dp_id, ts_util, stats in zip(dp_ids, [d.dp.sys_util for d in data], [d.sched_stats for d in data])])
 
                 if d.eff_sched_stats is not None:
+                    begin_sync(conn, c)
                     for dp_id, d in zip(dp_ids, data):
-                        __store_eff_sched_results(conn, dp_id, d.eff_sched_stats)
+                        __store_eff_sched_results(c, dp_id, d.eff_sched_stats)
+                    end_sync(conn)
 
-                # try to fetch the next design points while we hold the db lock
-                # not the best thing to do for concurrency, but lustre's locking
-                # is very slow/costly.
                 if ndp > 0:
-                    c = conn.cursor()
-                    c.execute('SELECT %s FROM dp_pending LIMIT ?' % (','.join(dp_col_names)), (ndp,))
+                    begin_sync(conn, c)
+#                    c.execute('LOCK TABLES dp_pending WRITE')
+                    c.execute('SELECT %s FROM dp_pending LIMIT %%s FOR UPDATE' % (','.join(dp_col_names)), (ndp,))
                     dps = [db_to_dp(d) for d in c.fetchall()]
                     nfetched = len(dps) if dps else 0
                     if nfetched > 0:
                         temp = [dp_to_db(d) for d in dps]
                         for i in xrange(nfetched):
                             c.execute('DELETE FROM dp_pending '
-                                      'WHERE %s' % ' AND '.join(map(lambda c: '%s=?'%c, dp_col_names)),
+                                      'WHERE %s' % ' AND '.join(map(lambda x: '%s=%s'%(x, dp_col_type_strs[x]), dp_col_names)),
                                       tuple(map(lambda x: temp[i][x], dp_col_names)))
                             if c.rowcount == 1:
                                 fetched.append(dps[i])
@@ -522,12 +534,13 @@ def store_sched_results(db_name, data, ndp = 0):
                                 print 'store_sched_results: raced for design point. dropping.'
                             else:
                                 print 'store_sched_results: deleted too many rows... dropping.'
-                # commit all changes
+#                c.execute('UNLOCK TABLES')
                 end_sync(conn)
-
+                c.close()
                 # success!
                 break
-        except lite.OperationalError:
+        except db.OperationalError, e:
+            print e
             if conn is not None:
                 conn.close()
                 del conn
@@ -540,7 +553,6 @@ def store_sched_results(db_name, data, ndp = 0):
                 time_to_sleep = (random.random()*20 + 5)*failcount
                 time_to_sleep = min(time_to_sleep, 4*60)
                 print 'retrying store_sched_results from scratch. sleep = %.3f' % time_to_sleep
-                os.system('sync')
                 time.sleep(time_to_sleep)
 
     if failcount > 0:
@@ -552,6 +564,4 @@ def store_sched_results(db_name, data, ndp = 0):
         del conn
         conn = None
 
-    os.system('sync')
-
     return fetched
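
Concurrency note: dp_pending acts as a shared work queue for many worker processes. With
sqlite the code serialized workers behind a database-wide write lock (BEGIN EXCLUSIVE
TRANSACTION); with MySQL/InnoDB it now relies on an explicit transaction plus
SELECT ... FOR UPDATE row locks before deleting the claimed rows. A simplified sketch of
that claim pattern (the real code in get_design_points() matches on every column via
dp_to_db()/db_to_dp() rather than deleting by id):

    import MySQLdb
    import MySQLdb.cursors

    def claim_pending(conn, n):
        """Atomically claim up to n rows from dp_pending and return them."""
        c = conn.cursor(MySQLdb.cursors.DictCursor)
        conn.autocommit(False)
        c.execute('START TRANSACTION')
        # Lock the candidate rows so concurrent workers cannot claim them too.
        c.execute('SELECT * FROM dp_pending LIMIT %s FOR UPDATE', (n,))
        rows = c.fetchall()
        for row in rows:
            c.execute('DELETE FROM dp_pending WHERE id=%s', (row['id'],))
        conn.commit()
        conn.autocommit(True)
        c.close()
        return rows
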
diff --git a/rtss14/rtss14.py b/rtss14/rtss14.py
index 6d12361..09529c2 100755
--- a/rtss14/rtss14.py
+++ b/rtss14/rtss14.py
@@ -10,7 +10,8 @@ import math
 import time
 import inspect
 
-import sqlite3 as lite
+#import sqlite3 as lite
+import MySQLdb
 
 
 
@@ -128,6 +129,7 @@ def create_gpu_task_set(dp, overheads = None):
         else:
             ts[i].uses_gpu = False
             ts[i].nrequests = 0
+            ts[i].nengine_requests = 0
 
     for t in ts:
         t.wss = dp.wss
@@ -174,14 +176,19 @@ def create_gpu_task_set(dp, overheads = None):
             t.max_np_interval = max(t.max_np_interval, overheads.gpu_xmit.xmit_cost(XmitOverheads.H2D, min(t.stdata, dp.chunk_size)))
             t.max_np_interval = max(t.max_np_interval, overheads.gpu_xmit.xmit_cost(XmitOverheads.D2H, min(t.stdata, dp.chunk_size)))
 
-            # one for token lock, one for exec engine, one for each chunk
+            t.nkernrequests = 1
+            t.nsendrequests = int(ceil(float(t.sdata)/dp.chunk_size))
+            t.nrecvrequests = int(ceil(float(t.rdata)/dp.chunk_size))
             t.nstaterequests = int(ceil(float(t.stdata)/dp.chunk_size))
-            t.nrequests = 1 + \
-                          1 + \
-                          int(ceil(float(t.sdata)/dp.chunk_size)) + \
-                          int(ceil(float(t.rdata)/dp.chunk_size)) + \
-                          t.nstaterequests
-
+            if not dp.p2p:
+                t.nstaterequests *= 2
+            if dp.rho > 0:
+                t.ntokenrequests = 1
+            else:
+                t.ntokenrequests = 0
+            t.nengine_requests = t.nkernrequests + t.nsendrequests + t.nrecvrequests + t.nstaterequests
+            t.nrequests = t.nengine_requests + t.ntokenrequests
+
     return ts
 
 def complete(results, n):
@@ -347,9 +354,11 @@ def is_schedulable(ts, dp, overheads):
         if len(g) <= tokensPerClstr or gsize == 1:
             for t in g:
                 if t.stdata != 0:
+                    t.nrequests -= t.nstaterequests
+                    t.nengine_requests -= t.nstaterequests
+                    t.nstaterequests = 0
                     t.stdata = 0
                     t.stcost = 0.0
-                    t.nrequests -= t.nstaterequests
 
     # do we overutilize any of the clusters, or does migration
     # cause per-task constraint violations?
@@ -400,7 +409,7 @@ def is_schedulable(ts, dp, overheads):
         if not p or len(p) == 0:
             continue
 
-        if not jlfp.charge_scheduling_overheads(overheads, size, False, p):
+        if not jlfp.charge_scheduling_overheads(overheads, size, False, p, clusters, gclusters, dp):
#            print 'failed in overhead charging'
             success = False
             break
@@ -714,7 +723,7 @@ def process_design_points(args):
 
             nfinished += nchunk
 
-        except lite.OperationalError:
+        except MySQLdb.OperationalError, e:
            print '%d: CRAP. Database Error while %s' % (os.getpid(), 'getting work.' if dps is None else 'storing results.')
            print traceback.format_exc()
     allend = time.time()
@@ -731,16 +740,17 @@ def TEST_get_dp_space(cpus):
     # system parameters
     exp.host = ['bonham']
     exp.ncpus = [12]
-#    exp.nclusters = [1, 2, 12]
     exp.nclusters = [1, 2, 12]
+#    exp.nclusters = [12]
     exp.ngpus = [8]
-#    exp.ngclusters = [1, 4, 8]
     exp.ngclusters = [1, 4, 8]
+#    exp.ngclusters = [8]
+    exp.release_master = [False]
     exp.ovh_type = ['mean']
     exp.polluters = [True]
 
     # gpusync config variables
-    exp.rho = [0,3]
+    exp.rho = [2]
     exp.dgl = [True,False]
     exp.p2p = [True,False]
 #    exp.ncopy_engines = [1, 2]
@@ -748,7 +758,8 @@ def TEST_get_dp_space(cpus):
     exp.chunk_size = [1*1024*1024] # 1MB (2MB?)
 
     # task parameters
-    exp.sys_util = [0.5,5.0,10.0]
+#    exp.sys_util = [0.5,5.0,10.0]
+    exp.sys_util = [5.0]
 #    exp.task_util = ['u-uni-medium', 'u-uni-heavy']
     exp.task_util = ['u-uni-medium']
     exp.period = ['p-uni-long']
@@ -775,7 +786,8 @@ def get_dp_space(cpus):
     exp.nclusters = [1, 2, 12]
     exp.ngpus = [8]
 #    exp.ngclusters = [1, 2, 4, 8]
-    exp.ngclusters = [1, 4, 8]
+    exp.ngclusters = [1, 2, 4, 8]
+    exp.release_master = [False]
     exp.polluters = [False, True]
     exp.ovh_type = ['mean', 'max']
 #    exp.ovh_type = ['mean']
@@ -790,8 +802,8 @@ def get_dp_space(cpus):
 
     # task parameters
 #    step_size = 0.1
-    step_size = 0.5
-#    step_size = 0.25
+#    step_size = 0.5
+    step_size = 0.25
     start_pt = step_size
 #    start_pt = 10.0
     exp.sys_util = [float(v) for v in arange(start_pt, cpus+step_size, step_size)]
@@ -807,9 +819,10 @@ def get_dp_space(cpus):
     exp.kern = ['k-uni-light', 'k-uni-medium', 'k-uni-heavy']
 #    exp.cpu_csx = ['c-const-light', 'c-const-medium', 'c-const-heavy']
     exp.cpu_csx = ['c-const-light']
 #    exp.data = ['d-const-light', 'd-const-medium', 'd-const-heavy', 'd-const-very-heavy']
-    exp.data = ['d-const-light', 'd-const-medium', 'd-const-heavy']
-    exp.state = ['s-const-zero', 's-const-light', 's-const-medium']
+    exp.data = ['d-uni-light', 'd-uni-medium', 'd-uni-heavy']
+#    exp.state = ['s-const-zero', 's-const-light', 's-const-medium', 's-const-heavy']
+    exp.state = ['s-const-zero', 's-uni-light', 's-uni-medium']
 
     return exp
 
@@ -844,8 +857,8 @@ def main():
     host = 'bonham'
 
     cpus = 12.0
-    exp = get_dp_space(cpus)
-#    exp = TEST_get_dp_space(cpus)
+#    exp = get_dp_space(cpus)
+    exp = TEST_get_dp_space(cpus)
 
     def valid(dp):
         # filter out gpus shared between cpu clusters for now
@@ -873,18 +886,47 @@ def main():
 
         return True
 
-    design_points = [dp for dp in DesignPointGenerator(exp, is_valid = valid)]
-    ndp = len(design_points)
+    if not args.pretend and not args.resume:
+        db.clear_tables(args.database)
+
+    # load design points incrementally as to not overload memory
+    gen = DesignPointGenerator(exp, is_valid = valid)
+    dp_chunk_sz = 10000 # number of design points to load at a time
+    ndp = 0
     if not args.pretend:
-        random.shuffle(design_points)
-        if not args.resume:
-            db.clear_tables(args.database)
-        nstored, npending = db.store_design_points(args.database, design_points, clean = not args.resume)
-        print "Loaded %d of %d design points. (%d were completed or pending, %d now pending)" % (
-            nstored, ndp, ndp - nstored, npending)
+        nstored = 0
+        npending = 0
+        while True:
+            dp_chunk = []
+            for _ in range(dp_chunk_sz):
+                try:
+                    dp_chunk.append(gen.next())
+                except StopIteration:
+                    break
+            if len(dp_chunk) == 0:
+                break
+            ndp += len(dp_chunk)
+            random.shuffle(dp_chunk)
+            tmpnstored, npending = db.store_design_points(args.database, dp_chunk, clean = not args.resume)
+            nstored += tmpnstored
+            print "Loaded %d of %d design points. (%d were completed or pending, %d now pending)" % (nstored, ndp, ndp - nstored, npending)
     else:
+        for _ in gen:
+            ndp += 1
         print "%d design points planned" % ndp
 
+#    design_points = [dp for dp in DesignPointGenerator(exp, is_valid = valid)]
+#    ndp = len(design_points)
+#    if not args.pretend:
+#        random.shuffle(design_points)
+#        if not args.resume:
+#            db.clear_tables(args.database)
+#        nstored, npending = db.store_design_points(args.database, design_points, clean = not args.resume)
+#        print "Loaded %d of %d design points. (%d were completed or pending, %d now pending)" % (
+#            nstored, ndp, ndp - nstored, npending)
+#    else:
+#        print "%d design points planned" % ndp
+
     if args.pretend or args.initonly:
         exit(0)
 
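
Loading note: main() no longer materializes every design point in memory before storing
them; it drains DesignPointGenerator in chunks of 10,000 points, shuffling and inserting
each chunk as it goes. The same idiom in isolation (Python 2; store() is a hypothetical
stand-in for db.store_design_points):

    import random

    def load_in_chunks(gen, store, chunk_sz=10000):
        """Consume generator gen in fixed-size chunks, shuffling and storing each."""
        total = 0
        while True:
            chunk = []
            for _ in range(chunk_sz):
                try:
                    chunk.append(gen.next())   # Python 2 iterator protocol
                except StopIteration:
                    break
            if len(chunk) == 0:
                break
            total += len(chunk)
            random.shuffle(chunk)
            store(chunk)
        return total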