from __future__ import print_function import os import numpy as np import matplotlib as mpl import krr as kr import datetime as dt import orcaparse as op # ------------------------Dictionary ML----------------------------------------- krr_l1_param = \ { # structural sigma used with laplace and gaussian kernels 'struct_sigma': 900, # electric sigma used with gaussian kernel 'electric_sigma': 1, # the next two values is for the select mechanisms # what is the min % acceptable for the new learn set 'min_select_size': None, # the step to decrease (t/threshold_adjust) the threshold # in order to rich the above % 'threshold_adjust': None, # function to compute the distance between descriptors,kernel dependent 'dist_function': kr.dist_l1_d1_d2, # function to compute the distance matrix,kernel dependent 'dist2kernel': kr.distmatrix2kernel, # kernel to use 'kernel': kr.laplace_kernel_all, # gamma value of the kernel 'kernel_gamma': 0.0, # function to make predictions, kernel dependent 'predict_func': kr.predict_v_l1, # path to the data base on this file 'data_path': "../data", # to get some output, and where to put it 'debug': True, 'debug_filepath': "./ml_out_put/output_" + dt.datetime.now().time().strftime("%I_%M_%S") + ".out" } krr_l2_param = \ { # structural sigma used with laplace and gaussian kernels # TODO each structural descriptor must have his own sigma 'struct_sigma': 900, # electric sigma used with gaussian kernel # TODO each electrical descriptor must have his own sigma 'electric_sigma': 1, # the next two values is for the select mechanisms # what is the min % acceptable for the new learn set 'min_select_size': None, # the step to decrease (t/threshold_adjust) the threshold # in order to rich the above % 'threshold_adjust': None, # function to compute the distance between descriptors,kernel dependent 'dist_function': kr.dist_l2_d1_d2, # function to compute the distance matrix,kernel dependent 'dist2kernel': kr.distmatrix2kernel, # kernel to use 'kernel': kr.gausian_kernel_all, # gamma value of the kernel 'kernel_gamma': 0.0, # function to make predictions, kernel dependent 'predict_func': kr.predict_v_l2, # path to the data base on this file 'data_path': "../data", # to get some output, and where to put it 'debug': True, 'debug_filepath': "./ml_out_put/output_" + dt.datetime.now().time().strftime("%I_%M_%S") + ".out" } # -----------------------------Utilities Functions------------------------------ def write_file(data, file_path=None): """ write a string in a file :param data: string :type data: :param file_path: :type file_path: :return: :rtype: None """ if os.path.exists(file_path): flag = 'a' else: flag = 'w' file = open(file_path, flag) file.write(data) file.close() def plot_png(axis_dataset, orden_dataset=None, graph_title="Image", file_path="image.png"): """ create an image with some data, TODO need to be review :param axis_dataset: :type axis_dataset: :param orden_dataset: :type orden_dataset: :param graph_title: :type graph_title: :param file_path: :type file_path: :return: :rtype: """ mpl.use('Agg') import matplotlib.pyplot as plt plt.plot(axis_dataset, orden_dataset, 'x-') plt.title(graph_title) plt.grid() plt.savefig("./pictures/" + file_path) plt.close() def plot_window(ntestsys, test_set, prediction, errs, ref_range, ref_key): import matplotlib.pyplot as plt # using rainbow colors from : # http://stackoverflow.com/questions/8389636/creating-over-20-unique-legend-colors-using-matplotlib NUM_COLORS = ref_range cm = plt.get_cmap('gist_rainbow') fig = plt.figure(1) plt.subplot(2, 2, 1) ax = fig.add_subplot(221) ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) plt.ylabel('Prediction') plt.xlabel('Real Values') # plt.title("Real Value vs Prediction eV") plt.grid() smallest = [] largest = [] for j in range(ntestsys): smallest.append(test_set[ref_key][j][0]) largest.append(test_set[ref_key][j][ref_range - 1]) x1 = np.minimum(np.min(smallest), np.min(prediction)) x2 = np.maximum(np.max(largest), np.max(prediction)) x1 -= 0.1 * x1 x2 += 0.1 * x1 plt.axis([x1, x2, x1, x2]) kr_line = [x1, x2] plt.plot(kr_line, kr_line, 'k-', alpha=0.75, zorder=0) for i in range(ref_range): r_value = [] p_value = [] for j in range(ntestsys): r_value.append(test_set[ref_key][j][i]) p_value.append(prediction[j][i]) plt.plot(r_value, p_value, 'o', label="{} {}".format(ref_key, i)) plt.subplot(2, 2, 1).legend(numpoints=1, loc='best', ncol=5, fontsize=8) plt.subplot(2, 2, 2) plt.ylabel('Error') plt.xlabel("{} level".format(ref_key)) plt.axis([0, ref_range - 1, 0, np.max(errs) + 0.1 * np.max(errs)]) plt.grid() for i in range(ntestsys): plt.plot(range(ref_range), errs[i], '^-', label="t {}".format(i)) plt.subplot(2, 2, 3) ax = fig.add_subplot(223) ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) plt.ylabel('Error') plt.xlabel('Test System') plt.axis([0, ntestsys - 1, 0, np.max(errs) + 0.1 * np.max(errs)]) plt.grid() mae_j = np.zeros((ref_range)) for i in range(ref_range): mae_j[i] = np.array([errs[j][i] for j in range(ntestsys)]).sum(axis=0) plt.plot(range(ntestsys), [errs[j][i] for j in range(ntestsys)], 'o') mae_j /= ntestsys for i in range(ref_range): plt.plot(range(ntestsys), [mae_j[i] for j in range(ntestsys)], '-', linewidth=2, label="mea({} {})".format(ref_key, i)) plt.subplot(2, 2, 3).legend(numpoints=1, loc='best', ncol=5, fontsize=8) plt.subplot(2, 2, 4) plt.ylabel('Error') plt.xlabel("{} level".format(ref_key)) plt.grid() plt.plot(range(ref_range), mae_j, '^-', label="MeanError") plt.subplot(2, 2, 4).legend(numpoints=1, loc='best', ncol=5) plt.show() def read_sys(nsystems, index=None, xyz=True, theory="pbe0", bhlgap=True, bdos=True, befermi=True, emin=-10., emax=10., eresolution=1, sigma=1, path_data="./data"): """ this function read data from file (3/feb/2017, from Orca calc) and return systems and all data extract from file :param nsystems: :param index: :param xyz: :param theory: :param bhlgap: :param bdos: :param befermi: :param emin: :param emax: :param eresolution: :param sigma: :param path_data: :return: """ systems, data, sel_index = \ kr.read_orca22json(nsystems, index, xyz, theory, data_root_dir=path_data) all_gs = op.add_ks_desc([data[i]['gs'] for i in range(nsystems)], emin=emin, bdos=bdos, bhlgap=bhlgap, befermi=befermi, emax=emax, eresolution=eresolution, sigma=sigma) return systems, all_gs, [data[i]['absspec'] for i in range(nsystems)],\ sel_index def read_sys_data(nsystems, index=None, xyz=True, theory="pbe0", energies=True, hlgap=True, path_data="./data"): return kr.read_orca22json(nsystems, index, xyz, theory, energies, hlgap, path_data) def get_desc_keys(all_gs, all_absspec): """ this function helps to know al the descriptors that the code can manage so the user can select a subsect of this :param all_gs: all ground state data :type all_gs: dictionary :param all_absspec: all especifications :type all_absspec: dictionary :return: key :rtype: [string] """ keys = [] keys.append('sys') for (gs, spec) in zip(all_gs, all_absspec): for k in gs: keys.append(k) for k in spec: keys.append(k) break return keys def select_desc_prop(systems, all_gs, all_absspec, keys): """ put all descriptors in a simple dic structure :param systems: all systems read from files :type systems: [ase.atoms] :param all_gs: dictionary with the systems ground state properties :type all_gs: dictionary {'key':[[s1],[s1]],'key2':[[2,3,4,..],[[3,3,3,..]]]...} :param all_absspec: dictionary with the systems all other properties :type all_absspec: dictionary{} same estructure :param keys: list of keys :type keys: [string] :return: dictionary :rtype: [key:[[]], key2:[[],[]]....] """ desc_prop = {} for key in keys: desc_prop[key] = [] for (sys, gs, spec) in zip(systems, all_gs, all_absspec): desc_prop['sys'].append(sys) for key in keys: if key in gs: desc_prop[key].append(gs[key]) elif key in spec: desc_prop[key].append(spec[key]) return desc_prop def get_test_index(nsys, ntestsystems, seed=None): """ generate the random set of test indices from the number of systems :param nsys: number of systems :param ntestsystems: number of wanted test systems :param seed: seed for random algorithm :return: [int] """ test_indices = kr.gen_random_index(nsys, ntestsystems, seed) return test_indices def get_training_set(desc_prop, keys, nsys, test_indices): learn_set = {} test_set = {} learn_index = [] for key in keys: if key not in learn_set: # initialize arrays for new keys learn_set[key] = [] test_set[key] = [] for i in test_indices: test_set[key].append(desc_prop[key][i]) for i in range(nsys): if i not in set(test_indices): learn_set[key].append(desc_prop[key][i]) if len(learn_index) < (nsys - len(test_indices)): learn_index.append(i) # print("learn index: ") # print(learn_index) return learn_set, test_set # --------------------ML-Functions---------------------------------------------- def selective_test(learn_desc, test_desc, param): """ select best fit learn systems to predict a test descriptor :param learn_desc: list of all learn descriptors :type learn_desc: GDesc class object with multiples systems :param test_desc: test descriptor :type test_desc: GDesc class object with only one system :param param: dictionary with the ML parameters :type param: dictionary, especial Key must exist :return: index, threshold :rtype: [int], float """ return kr.select_best_fit_desc(learn_desc=learn_desc, test_desc=test_desc, min_select_size=param['min_select_size'], threshold_adjust=param['threshold_adjust'], dist_d1d2=param['dist_function'], dist2kernel=param['dist2kernel'], s_sigma=param['struct_sigma'], e_sigma=param['electric_sigma']) def learn_something(ml, learn_desc, learn_ref, param): """ construct the prediction model, initialize :param ml: this parameter avoids the kernel matrix recalculation :type ml: kr.Machine class object :param learn_desc: input learn descriptors :type learn_desc: GDesc class object :param learn_ref: the actual values :type learn_ref: [float] :param param: dictionary kernel parameters :type param: dictionary :return: model generated by kr.Machine class :rtype: kr.Machine class object """ if ml is None: ml = kr.Machine(learn_desc, param['struct_sigma'], param['electric_sigma'], param['kernel']) ml.coeff_cal(learn_ref, param['kernel_gamma']) return ml def predict_something(ml, test_desc, param): """ this is more intuitive funtion for predicttios :param ml: already created model :type ml: Machine object class :param test_desc: test descriptor to predict :type test_desc: GDesc class object :param param: dictionary with the kernel parameters :type param: dictionary :return: value, or multiples values always in a numpy array :rtype: np.array[float] """ return ml.predict(test_desc, param['predict_func']) # --------------------Entry_Point Function-------------------------------------- def learn_method(learn_set, nlearnsys, test_set, ntestsys, struc_key=None, struc_desc_method=None, elec_key=None, electr_desc_method=None, ref_key=None, selecting_method=None, ref_range=1, ml_krr_param=None): """ this function will iterate on ntestsys, evaluating the function given by the parameter 'selecting_method', only build descriptors and separate, references :param learn_set: dictionary with all learning system and data :type learn_set: dictionary with the same structure handle so far :param nlearnsys: numbers of systems :type nlearnsys: int :param test_set: dictionary with all test system and data :type test_set: dictionary with the same structure handle so far :param ntestsys: number of test systems :type ntestsys: int :param struc_key: names of the key in the dictionary that represent the the structure descriptor :type struc_key: [string] :param elec_key: names of the key in the dictionary that represent the the electric descriptor :type elec_key: [string] :param ref_key: names of the key on both dictionary that represent the the reference parameter :type ref_key: string :param selecting_method: function pointer to the method who actually does the ML procces :type selecting_method: function pointer :param ref_range: to compute the whole expectro :param ref_range: int :param ml_krr_param: dictionary with the extra parameters for the 'selecting_method' function :type ml_krr_param: dictionary :return: errs, ref_values, prediction :rtype: [] """ assert struc_key or elec_key, \ "At least one type of descriptor must be define" assert ref_key, "Reference key must exist in this context" test_set_struct = None test_set_elect = None learn_set_struct = None learn_set_elect = None # creating a learn descriptor for the if struc_key: learn_set_struct = \ {k: learn_set[k] for k in learn_set if k in struc_key} if elec_key: learn_set_elect = \ {k: learn_set[k] for k in learn_set if k in elec_key} learn_desc = kr.GDesc(struc_desc=learn_set_struct, struc_desc_method=struc_desc_method, elect_desc=learn_set_elect, electr_desc_method=electr_desc_method, nsystems=nlearnsys) prediction = np.zeros(shape=(len(test_set[ref_key]), ref_range)) errs = np.zeros(shape=(len(test_set[ref_key]), ref_range)) ref_values = np.zeros(shape=(len(test_set[ref_key]), ref_range)) # if debug flag is set... if ml_krr_param['debug']: title = "{0:6}|{1:8}|{2:9}|{3:11}|{4:5}|{5:12}|{6:12}|{7:12}|{8:12}\n" data_form = "{0:6}|{1:8}|{2:9}|{3:11}|{4:5}|{5:.10f}|" \ "{6:.10f}|{7:.10f}|{8:.10f}\n" write_file(title.format("index", "Index_DB", "Label", "Energ_Level", "#TS", "Threshold", "MAE", "RV", "PV"), ml_krr_param['debug_filepath']) ml = None threshold = float(0) for i in range(ntestsys): # this part can be completely parallel because each process is # independent from each other if struc_key: test_set_struct = \ {k: test_set[k][i] for k in test_set if k in struc_key} if elec_key: test_set_elect = \ {k: test_set[k][i] for k in test_set if k in elec_key} # we are trying this one by one test_desc = kr.GDesc(struc_desc=test_set_struct, struc_desc_method=struc_desc_method, elect_desc=test_set_elect, electr_desc_method=electr_desc_method, nsystems=1) if selecting_method is not None: sel_index, threshold = selecting_method(learn_desc, test_desc, ml_krr_param) for j in range(ref_range): if selecting_method is None: nlearn_desc = learn_desc threshold = 0 learn_ref = \ [learn_set[ref_key][r][j] for r in range(len(learn_set[ref_key]))] sel_index = [] test_ref = test_set[ref_key][i][j] else: ml = None s_index = set(sel_index) learn_ref = [learn_set[ref_key][r] for r in sel_index] learn_ref = [learn_ref[r][j] for r in range(len(learn_ref))] test_ref = test_set[ref_key][i][j] nlearn_set_struct = None nlearn_set_elect = None if struc_key: nlearn_set_struct = \ {k: [learn_set_struct[k][h] for k in learn_set_struct for h, ds in enumerate(learn_set_struct[k]) if h in s_index]} if elec_key: nlearn_set_elect = \ {k: [learn_set_elect[k][h] for k in learn_set_elect for h, ds in enumerate(learn_set_elect[k]) if h in s_index]} # rebuilding the learn_descriptor nlearn_desc = kr.GDesc(struc_desc=nlearn_set_struct, elect_desc=nlearn_set_elect, nsystems=len(sel_index)) ml = learn_something(ml, nlearn_desc, learn_ref, ml_krr_param) p = predict_something(ml, test_desc, ml_krr_param) prediction[i, j] = p[0] errs[i, j] = np.abs(p[0] - test_ref) ref_values[i, j] = test_ref if ml_krr_param['debug']: write_file(data_form.format(i, ml_krr_param['test_index'][i], test_set['sys'][i]. get_chemical_formula(mode='reduce'), j, len(sel_index), threshold, errs[i, j], test_ref, prediction[i, j]), ml_krr_param['debug_filepath']) # plot_window(ntestsys, test_set, prediction, # errs, ref_range, ref_key) return errs, ref_values, prediction def beaker_learn_method(learn_set, nlearnsys, test_set, ntestsys, struc_key=None, struc_desc_method=None, elec_key=None, ref_key=None, ml_krr_param=None): """ this function will iterate on ntestsys, evaluating the function given by the parameter 'selecting_method', only build descriptors and separate, references :param learn_set: dictionary with all learning system and data :type learn_set: dictionary with the same structure handle so far :param nlearnsys: numbers of systems :type nlearnsys: int :param test_set: dictionary with all test system and data :type test_set: dictionary with the same structure handle so far :param ntestsys: number of test systems :type ntestsys: int :param struc_key: names of the key in the dictionary that represent the the structure descriptor :type struc_key: [string] :param elec_key: names of the key in the dictionary that represent the the electric descriptor :type elec_key: [string] :param ref_key: names of the key on both dictionary that represent the the reference parameter :type ref_key: string :param selecting_method: function pointer to the method who actually does the ML procces :type selecting_method: function pointer :param ref_range: to compute the whole expectro :param ref_range: int :param ml_krr_param: dictionary with the extra parameters for the 'selecting_method' function :type ml_krr_param: dictionary :return: errs, ref_values, prediction :rtype: [] """ assert struc_key or elec_key, \ "At least one type of descriptor must be define" assert ref_key, "Reference key must exist in this context" test_set_struct = None test_set_elect = None learn_set_struct = None learn_set_elect = None # creating a learn descriptor for the if struc_key: learn_set_struct = {struc_key: learn_set[struc_key]} if elec_key: learn_set_elect = {elec_key: learn_set[elec_key]} learn_desc = kr.GDesc(struc_desc=learn_set_struct, struc_desc_method=struc_desc_method, elect_desc=learn_set_elect, electr_desc_method=None, nsystems=nlearnsys, maxlen=ml_krr_param['maxlen']) prediction = np.zeros(shape=(len(test_set[ref_key]), 1)) errs = np.zeros(shape=(len(test_set[ref_key]), 1)) ref_values = np.zeros(shape=(len(test_set[ref_key]), 1)) # if debug flag is set... if ml_krr_param['debug']: title = "{0:6}|{1:8}|{2:9}|{3:11}|{4:5}|{5:12}|{6:12}|{7:12}|{8:12}\n" data_form = "{0:6}|{1:8}|{2:9}|{3:11}|{4:5}|{5:.10f}|" \ "{6:.10f}|{7:.10f}|{8:.10f}\n" write_file(title.format("index", "Index_DB", "Label", "Energ_Level", "#TS", "Threshold", "MAE", "RV", "PV"), ml_krr_param['debug_filepath']) learn_ref = learn_set[ref_key] ml = learn_something(None, learn_desc, learn_ref, ml_krr_param) for i in range(ntestsys): # this part can be completely parallel because each process is # independent from each other if struc_key: test_set_struct = {struc_key: test_set[struc_key][i]} if elec_key: test_set_elect = {elec_key: test_set[elec_key][i]} # we are trying this one by one test_desc = kr.GDesc(struc_desc=test_set_struct, struc_desc_method=struc_desc_method, elect_desc=test_set_elect, electr_desc_method=None, nsystems=1, maxlen=ml_krr_param['maxlen']) test_ref = test_set[ref_key][i] p = predict_something(ml, test_desc, ml_krr_param) prediction[i] = p[0] errs[i] = np.abs(p[0] - test_ref) ref_values[i] = test_ref if ml_krr_param['debug']: write_file(data_form.format(i, ml_krr_param['test_index'][i], test_set[struc_key][i]. get_chemical_formula(mode='reduce'), float(0), 0, float(0), np.abs(p[0] - test_ref), test_ref, p[0]), ml_krr_param['debug_filepath']) # plot_window(ntestsys, test_set, prediction, # errs, ref_range, ref_key) return errs, ref_values, prediction # --------------------Test Main------------------------------------------------- def test(nsys, delta=False): ntsys = int(nsys / 10) krr_l1_param['debug_filepath'] = "./ml_out_put/output_" + \ dt.datetime.now().time().strftime("%I_%M_%S") krr_l1_param['data_path'] = "./data" with kr.timer('read data'): if delta: sys, pbe0_gs, pbe0_abs_spec, sys_index = \ read_sys(nsys, theory="pbe0", path_data=krr_l1_param['data_path']) lda_sys, lda_gs, lda_abs_spec, sys_index = \ read_sys(nsys, theory="lda", index=sys_index, xyz=False, path_data=krr_l1_param['data_path']) keys = get_desc_keys(pbe0_gs, pbe0_abs_spec) desc = select_desc_prop(sys, pbe0_gs, pbe0_abs_spec, keys) lda_desc = select_desc_prop(sys, lda_gs, lda_abs_spec, keys) pbe0_ener = np.array(desc['exe']) lda_ener = np.array(lda_desc['exe']) desc['exe'] = (pbe0_ener - lda_ener) / 8065.54429 else: sys, gs, abs_spec, sys_index = \ read_sys(nsys, theory="pbe0", path_data=krr_l1_param['data_path']) keys = get_desc_keys(gs, abs_spec) desc = select_desc_prop(sys, gs, abs_spec, keys) ener = np.array(desc['exe']) desc['exe'] = ener / 8065.54429 tindex = get_test_index(nsys=nsys, ntestsystems=ntsys) krr_l1_param['test_index'] = [sys_index[i] for i in tindex] learn_set, test_set = get_training_set(desc_prop=desc, keys=keys, nsys=nsys, test_indices=tindex) struc_desc_method = {'key': 'sys', 'method': kr.sorted_coulomb_matrix} krr_l1_param['debug_filepath'] += ".out" # NORMAL TEST WITH PRUNING with kr.timer('NORMAL TEST WITH PRUNING'): krr_l1_param['struct_sigma'] = 900 krr_l1_param['electric_sigma'] = None learn_method(learn_set, nsys - ntsys, test_set, ntsys, struc_key=['sys'], struc_desc_method=struc_desc_method, # elec_key=['hlgap'], elec_key=None, electr_desc_method=None, selecting_method=selective_test, ref_key='exe', ml_krr_param=krr_l1_param) def beaker_test_run(db_type, nsys, ntsys, theory_type, selecting_method, struc_key=None, struc_desc_method=None, elec_key=None, electr_desc_method=None, ref_key=None, ref_range=None, ml_param=None): if ref_key == 'fosc': energie = False else: energie = True with kr.timer('read data'): if db_type == "8_CONF": # this is for changing data base when we have more than one pass if theory_type == "delta": sys, pbe0_desc, sys_index = \ read_sys_data(nsys, theory="pbe0", energies=energie, path_data=ml_param['data_path']) lda_sys, desc, sys_index = \ read_sys_data(nsys, theory="lda", index=sys_index, xyz=False, energies=energie, path_data=ml_param['data_path']) desc[struc_key] = sys desc[ref_key] = (np.array(pbe0_desc[ref_key]) - np.array(desc[ref_key])) else: sys, desc, sys_index = \ read_sys_data(nsys, theory=theory_type, energies=energie, path_data=ml_param['data_path']) desc[struc_key] = sys ml_param['maxlen'] = max(len(atoms) for atoms in sys) tindex = get_test_index(nsys=nsys, ntestsystems=ntsys, seed=dt.datetime.now().microsecond) keys = [i for i in desc] ml_param['test_index'] = [sys_index[i] for i in tindex] print("sys index") print(sys_index) print("test index") print(ml_param['test_index']) learn_set, test_set = get_training_set(desc_prop=desc, keys=keys, nsys=nsys, test_indices=tindex) struc_method = {'key': struc_key, 'method': struc_desc_method} with kr.timer('Time for the whole test'): return beaker_learn_method(learn_set, nsys - ntsys, test_set, ntsys, struc_key=struc_key, struc_desc_method=struc_method, elec_key=elec_key, ref_key=ref_key, ml_krr_param=ml_param) def beaker_entry(param): db_type = None theory_type = None selecting_method = None struc_desc_method = None elec_key = None ref_key = None ml_param = {} ml_param['threshold_adjust'] = float(0) ml_param['min_select_size'] = float(0) if param['ganma'] == '': ml_param['kernel_gamma'] = 0 else: ml_param['kernel_gamma'] = param['ganma'] ml_param['data_path'] = "./data" ml_param['debug'] = True ml_param['debug_filepath'] = "./ml_out_put/output_" + \ dt.datetime.now().time().strftime("%I_%M_%S") \ + ".out" # feature param for switching data base, fixed if param['db_type'] == 1: db_type = "8_CONF" nsys = param['training_set_size'] + param['test_set_size'] assert nsys <= 21785, "data base not big enough" # if param['training_set_size'] == 0: # nsys = 1000 # elif param['training_set_size'] == 1: # nsys = 2000 # elif param['training_set_size'] == 2: # nsys = 5000 # elif param['training_set_size'] == 3: # nsys = 10000 # elif param['training_set_size'] == 4: # nsys = 15000 ntestsys = param['test_set_size'] # if param['test_set_size'] == 0: # ntestsys = int(nsys/100) # elif param['test_set_size'] == 1: # ntestsys = int(nsys*2 / 100) # elif param['test_set_size'] == 2: # ntestsys = int(nsys*10 / 100) # elif param['test_set_size'] == 3: # ntestsys = int(nsys*20 / 100) # elif param['test_set_size'] == 4: # ntestsys = int(nsys*50 / 100) ml_param['dist2kernel'] = kr.distmatrix2kernel if param['ml_norm'] == 1: ml_param['dist_function'] = kr.dist_l1_d1_d2 ml_param['kernel'] = kr.laplace_kernel_all ml_param['predict_func'] = kr.predict_v_l1 elif param['ml_norm'] == 2: ml_param['dist_function'] = kr.dist_l1_d1_d2 ml_param['kernel'] = kr.laplace_kernel_all ml_param['predict_func'] = kr.predict_v_l1 if param['theory_type'] == 1: theory_type = "lda" elif param['theory_type'] == 2: theory_type = "pbe0" elif param['theory_type'] == 3: theory_type = "delta" if param['method'] == 1: selecting_method = selective_test elif param['method'] == 2: selecting_method = None if param['s_desc'] == 3 and param['e_desc'] == 3: assert "one descriptor must be selected" if param['s_desc'] == 3: struc_key = None ml_param['struct_sigma'] = None else: struc_key = 'sys' ml_param['struct_sigma'] = param['s_sigma'] if param['s_desc'] == 1: struc_desc_method = kr.sorted_coulomb_matrix elif param['s_desc'] == 2: struc_desc_method = kr.diagonalize_coulumb_matrix electr_desc_method = None if param['e_desc'] == 3: elec_key = None ml_param['electric_sigma'] = None elif param['e_desc'] == 1: ml_param['electric_sigma'] = param['e_sigma'] elec_key = 'hlgap' elif param['e_desc'] == 2: ml_param['electric_sigma'] = param['e_sigma'] elec_key = 'dos' ref_range = param['range'] if param['proper'] == 1: ref_key = 'exe' elif param['proper'] == 2: ref_key = 'fosc' thekwargs = dict(db_type=db_type, nsys=nsys, ntsys=ntestsys, theory_type=theory_type, selecting_method=selecting_method, struc_key=struc_key, struc_desc_method=struc_desc_method, elec_key=elec_key, electr_desc_method=electr_desc_method, ref_key=ref_key, ref_range=ref_range, ml_param=ml_param) for kw in thekwargs: print(kw, repr(thekwargs[kw])) errs, ref_values, prediction = \ beaker_test_run(db_type=db_type, nsys=nsys, ntsys=ntestsys, theory_type=theory_type, selecting_method=selecting_method, struc_key=struc_key, struc_desc_method=struc_desc_method, elec_key=elec_key, electr_desc_method=electr_desc_method, ref_key=ref_key, ref_range=ref_range, ml_param=ml_param) return errs, ref_values, prediction, ml_param['debug_filepath'] if __name__ == '__main__': # with kr.timer('read all'): # systems, data, index = kr.read_orca22json(1000, None, True, "lda", # True, True, # "./data") # systems, data, index = kr.read_orca22json(21500, None, True, "pbe0", # True, True, # "./data") # test(3000) param = {'db_type': 1, 'ml_norm': 1, 'theory_type': 3, 'e_sigma': 1, 'e_desc': 1, 'training_set_size': 4, 'range': 1, 's_sigma': 1000, 'ganma': 0.02, 's_desc': 1, 'proper': 1, 'method': 3, 'test_set_size': 4, 'theory_type': 3} # 1=LDA, 2=PBE0, 3=Delta # {'db_type': 1, 'ml_norm': 1, 'e_desc': 3, 'e_sigma': 1, # 'theory_type': 2, 'training_set_size': 2000, 'range': 1, # 's_sigma': 1000, 'ganma': 0.02, 's_desc': 2, 'proper': 1, # 'method': 2, 'test_set_size': 1} beaker_entry(param) # test(20, False) # op.two_json("./orca", "lda") # op.two_json("./orca", "pbe0")