import sys, os from libtbx import group_args from scitbx.array_family import flex def show(histogram): h_1 = histogram lc_1 = histogram.data_min() s_1 = enumerate(histogram.slots()) for (i_1,n_1) in s_1: hc_1 = h_1.data_min() + h_1.slot_width() * (i_1+1) print "%6.2f - %6.2f: %5d" % (lc_1,hc_1,n_1) lc_1 = hc_1 def file_to_object(lines): ### pdbid = None unit_cell = None space_group = None # n_models = None n_altconf = None # other = None dna_rna = None amino_acid = None water = None small_molecule = None # data_label = None high_resolution = None low_resolution = None completeness_in_range = None completeness_d_min_inf = None completeness_6_inf = None wilson_b = None number_of_reflections = None test_set_size = None test_flag_value = None number_of_Fobs_outliers = None anomalous_flag = None # number_of_atoms = None adp_min = None adp_max = None adp_mean = None occ_min = None occ_max = None occ_mean = None # bonds_mean = None bonds_max = None bonds_count = None angles_mean = None angles_max = None angles_count = None dihedrals_mean = None dihedrals_max = None dihedrals_count = None chirality_mean = None chirality_max = None chirality_count = None planarity_mean = None planarity_max = None planarity_count = None non_bonded_min = None # r_work = None r_free = None kb_sol = [None, None] b_cart = None # program_name_pub = None year_pub = None r_work_pub = None r_free_pub = None high_resolution_pub = None low_resolution_pub = None sigma_cutoff_pub = None ### for x in lines: x = x.strip() xs = x.split() if(x.startswith("Model and data files:")): pdbid = xs[5][:-4] if(x.startswith("unit cell:" )): unit_cell = xs[2:] if(x.startswith("space group:" )): space_group = xs[2:] # if(x.startswith("Number of models:")): n_models = int(xs[3]) if(x.startswith("Number of residues in alternative conformations:")): n_altconf = int(xs[6]) # if(x.startswith("other")): other = int(xs[2]) if(x.startswith("dna_rna")): dna_rna = int(xs[2]) if(x.startswith("amino_acid")): amino_acid = int(xs[2]) if(x.startswith("water")): water = int(xs[2]) if(x.startswith("small_molecule")): small_molecule = int(xs[2]) # if(x.startswith("data_label :")): data_label = xs[2] if(x.startswith("high_resolution :")): high_resolution = float(xs[2]) if(x.startswith("low_resolution :")): low_resolution = float(xs[2]) if(x.startswith("completeness_in_range :")): completeness_in_range = float(xs[2]) if(x.startswith("completeness(d_min-inf) :")): completeness_d_min_inf = float(xs[2]) if(x.startswith("completeness(6A-inf) :")): completeness_6_inf = float(xs[2]) if(x.startswith("wilson_b :")): wilson_b = xs[2] if(x.startswith("number_of_reflections :")): number_of_reflections = int( xs[2]) if(x.startswith("test_set_size(%) :")): test_set_size = float(xs[2]) if(x.startswith("test_flag_value :")): test_flag_value = xs[2] if(x.startswith("number_of_Fobs_outliers :")): number_of_Fobs_outliers = int( xs[2]) if(x.startswith("anomalous_flag :")): anomalous_flag = xs[2] # if(x.startswith("atom_number_(type:count:occ_sum) :")): number_of_atoms = int(xs[2]) if(x.startswith("ADP_(min,max,mean) :")): adp_min, adp_max, \ adp_mean = float(xs[2]), \ float(xs[3]), float(xs[4]) if(x.startswith("occupancies_(min,max,mean) :")): occ_min, occ_max, \ occ_mean = float(xs[2]), \ float(xs[3]), float(xs[4]) if(x.startswith("number_of_anisotropic :")): int(xs[2]) if(x.startswith("number_of_non_positive_definite :")): int(xs[2]) # if(x.startswith("bonds :")): bonds_mean, bonds_max, bonds_count = \ float(xs[2]), float(xs[3]), int(xs[4]) if(x.startswith("angles :")): angles_mean, angles_max, angles_count = \ float(xs[2]), float(xs[3]), int(xs[4]) if(x.startswith("dihedrals :")): dihedrals_mean, dihedrals_max, dihedrals_count = \ float(xs[2]), float(xs[3]), int(xs[4]) if(x.startswith("chirality :")): chirality_mean, chirality_max, chirality_count = \ float(xs[2]), float(xs[3]), int(xs[4]) if(x.startswith("planarity :")): planarity_mean, planarity_max, planarity_count = \ float(xs[2]), float(xs[3]), int(xs[4]) if(x.startswith("non-bonded (min) :")): non_bonded_min = float(xs[3]) # if(x.startswith("r_work(re-computed) :")): r_work = float(xs[2]) if(x.startswith("r_free(re-computed) :")): r_free = xs[2] if(x.startswith("bulk_solvent_(k_sol,b_sol) :")): kb_sol = [float(xs[2]), float(xs[3])] if(x.startswith("overall_anisotropic_scale_(b_cart) :")): b_cart = xs[2:] # if(x.startswith("program_name :")): program_name_pub = xs[2] if(x.startswith("year :")): year_pub = xs[2] if(x.startswith("r_work :")): r_work_pub = xs[2] if(x.startswith("r_free :")): r_free_pub = xs[2] if(x.startswith("high_resolution :")): high_resolution_pub = xs[2] if(x.startswith("low_resolution :")): low_resolution_pub = xs[2] if(x.startswith("sigma_cutoff :")): sigma_cutoff_pub = xs[2] ### if(pdbid is not None and r_work is not None): return group_args( pdbid = pdbid , unit_cell = unit_cell , space_group = space_group , n_models = n_models , n_altconf = n_altconf , other = other , dna_rna = dna_rna , amino_acid = amino_acid , water = water , small_molecule = small_molecule , data_label = data_label , high_resolution = high_resolution , low_resolution = low_resolution , completeness_in_range = completeness_in_range , completeness_d_min_inf = completeness_d_min_inf , completeness_6_inf = completeness_6_inf , wilson_b = wilson_b , number_of_reflections = number_of_reflections , test_set_size = test_set_size , test_flag_value = test_flag_value , number_of_Fobs_outliers = number_of_Fobs_outliers, anomalous_flag = anomalous_flag , number_of_atoms = number_of_atoms , adp_min = adp_min , adp_max = adp_max , adp_mean = adp_mean , occ_min = occ_min , occ_max = occ_max , occ_mean = occ_mean , bonds_mean = bonds_mean , bonds_max = bonds_max , bonds_count = bonds_count , angles_mean = angles_mean , angles_max = angles_max , angles_count = angles_count , dihedrals_mean = dihedrals_mean , dihedrals_max = dihedrals_max , dihedrals_count = dihedrals_count , chirality_mean = chirality_mean , chirality_max = chirality_max , chirality_count = chirality_count , planarity_mean = planarity_mean , planarity_max = planarity_max , planarity_count = planarity_count , non_bonded_min = non_bonded_min , r_work = r_work , r_free = r_free , kb_sol = kb_sol , b_cart = b_cart , program_name_pub = program_name_pub , year_pub = year_pub , r_work_pub = r_work_pub , r_free_pub = r_free_pub , high_resolution_pub = high_resolution_pub , low_resolution_pub = low_resolution_pub , sigma_cutoff_pub = sigma_cutoff_pub) else: return None def run(): files = os.listdir('/net/chevy/raid1/afonine/PROJ/pdbmtz/phenix_mvd_2008_DEC_10_17h25') result = [] n_log_files = 0 for i_seq, file in enumerate(files): if(file.endswith('.log')): n_log_files += 1 lines = open(file, "r").readlines() result_ = file_to_object(lines = lines) if(result_ is not None): result.append(result_) if(i_seq%500 == 0): print i_seq print "Total file in directory: ", len(files) print "Total .log files: ", n_log_files print "Total results: ", len(result) return result if (__name__ == "__main__"): result = run() for r in result: if(r.completeness_in_range is not None and r.completeness_in_range > 0.99 and r.completeness_6_inf is not None and r.completeness_6_inf > 0.99): print "%5s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s" % (r.pdbid, str(r.r_work), str(r.r_free), str(r.high_resolution), str(r.low_resolution), str(r.completeness_in_range), str(r.completeness_6_inf), str(r.number_of_atoms), str(r.year_pub), str(r.r_work_pub),str(r.r_free_pub)) if 0: ### compute a histogram of R-factors for structures at given resolution #rg = [(1.4,1.6),(1.9,2.1),(2.3,2.5),(1.6,1.9),(2.4,2.6),(1.7,1.9),(2.6,2.8),(2.9,3.1),(2.0,2.2)] rg = [(1.55,1.65), ] for pair in rg: r_work = flex.double() r_free = flex.double() d_min = flex.double() delta = flex.double() pdbids = flex.std_string() cir = flex.double() adp_mean = flex.double() for i_seq, r in enumerate(result): if(r.r_work_pub != str(None) and r.r_free_pub != str(None)): rf = float(r.r_free_pub) rw = float(r.r_work_pub) r_work.append(rw) r_free.append(rf) delta.append(rf-rw) d_min.append(r.high_resolution) pdbids.append(r.pdbid) adp_mean.append(r.adp_mean) cir.append(r.completeness_in_range) r_work *= 100. r_free *= 100. delta *= 100. selection = d_min >= pair[0] selection &= d_min <= pair[1] selection &= delta > 0 selection &= r_work > 3. selection &= r_free < 50. r_work = r_work.select(selection) d_min = d_min.select(selection) r_free = r_free.select(selection) delta = delta .select(selection) pdbids = pdbids.select(selection) cir = cir.select(selection) print pair, " %4.1f %4.1f %4.1f %d" % (flex.mean(r_work), flex.mean(r_free), flex.mean(cir)*100., cir.size()) adp_mean = adp_mean.select(selection) for pid, r, d in zip(pdbids, r_work, d_min): print "%s %10.4f %10.4f" % (pid, r, d) h_rw = flex.histogram(data = r_work, n_slots = 10) h_rf = flex.histogram(data = r_free, n_slots = 10) h_d = flex.histogram(data = delta, n_slots = 10) print flex.mean(delta), delta.size() print "Distribution of R-work at resolution %6.2f-%6.2f"%(pair[0], pair[1]) show(h_rw) print "Distribution of R-free %6.2f-%6.2f"%(pair[0], pair[1]) show(h_rf) print "Distribution of R-work-Rfree %6.2f-%6.2f"%(pair[0], pair[1]) show(h_d)