Source code for atomdnn.descriptor

import os
import time
from ase.io import read,write
import atomdnn
import re
import glob
import shutil
import random

def sorted_alphanumeric(filenames):
    """
    Sort file names in alphanumeric order.
    """
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(filenames, key=alphanum_key)



[docs]def get_filenames(file_path,file_name):
    """
    Get the names of a set of files that match the patten given by file_name. 
    Return the filenames having the format of 'string1*string2',
    in which string1 and string2 can be any strings amd * must be an integer.
    """
    true_path = os.path.abspath(os.path.join(file_path,file_name))
    if '*' in file_name:
        name_mask = file_name.split('*')
        if len(name_mask)!=2:
            raise ValueError('The file name \'%s\' can only has one *.'%file_name)
        filenames = glob.glob(true_path) # get the files matching the given pattern
      
        # make sure '*' only replace numbers, take out non-related files
        filenames = [filenames[i] for i in range(len(filenames)) \
                     if os.path.basename(filenames[i]).replace(name_mask[0],'').replace(name_mask[1],'').isdigit()]
        filenames = sorted_alphanumeric(filenames) # sort the files
        if len(filenames)==0:
            raise FileNotFoundError('Cannot find any files \'%s\' in %s'%(file_name,file_path))
        return filenames
    else:
        return [true_path]


    
[docs]def create_lmp_input(descriptor,descriptors_path):
    """
    Creates a lammps input file for computing descriptors and derivatives
    
    Args:
        descritpor (dictionary): the parameter dictionary of the descriptor 

    """
    infile = os.path.join(descriptors_path,'in.gen_descriptors')
    lmpinfile = open(infile,'w')    
    G2_parameters = ''
    for i in range(len(descriptor['etaG2'])):
        G2_parameters += str(descriptor['etaG2'][i])+' '
        
    G4_parameters = ''
    for i in range(len(descriptor['etaG4'])):
        G4_parameters += str(descriptor['etaG4'][i])+' '
        
    zeta_parameters = ''
    for i in range(len(descriptor['zeta'])):
        zeta_parameters += str(descriptor['zeta'][i])+' '
        
    lambda_parameters = ''
    for i in range(len(descriptor['lambda'])):
        lambda_parameters += str(descriptor['lambda'][i])+' '
        
    compute_fp_line = 'compute 1 all fingerprints ${cutoff} etaG2 ' + G2_parameters \
                       + 'etaG4 ' + G4_parameters \
                       + 'zeta ' + zeta_parameters \
                       + 'lambda ' + lambda_parameters + 'end\n'
    
    compute_der_line = 'compute 2 all derivatives ${cutoff} etaG2 '+ G2_parameters \
                       + 'etaG4 ' + G4_parameters \
                       + 'zeta ' + zeta_parameters \
                       + 'lambda ' + lambda_parameters + 'end\n'

    dump_fp_line =  'dump dump_fingerprints all custom 1 ${fp_filename} id type c_1[*]\n'+ \
                    'dump_modify dump_fingerprints sort id format float %20.10g\n'

    dump_der_line = 'dump dump_primes all local 1 ${der_filename} c_2[*]\n'+ \
                    'dump_modify dump_primes format float %20.10g\n'

    
    if atomdnn.compute_force:
        compute_line = compute_fp_line + compute_der_line
        dump_line = dump_fp_line + dump_der_line
    else:
        compute_line = compute_fp_line
        dump_line = dump_fp_line
        
    lmpinfile.writelines('clear\n'+
                         'dimension 3\n'+
                         'boundary p p p\n'+
                         'units metal\n'+
                         'atom_style atomic\n'+
                         'variable cutoff equal ' + str(descriptor['cutoff']) + '\n' +
                         'read_data ${lmpdatafile}\n' +
                         'mass * 1.0\n'+
                         'log ${logfile}\n'+
                         'pair_style zero ${cutoff} nocoeff\n'+
                         'pair_coeff * * 1.0 1.0\n'+
                         'neighbor 0.0 bin\n' +
                          compute_line + 
                          dump_line +
                         'fix NVE all nve\n'+
                         'run 0')

    

[docs]def create_descriptors(elements, xyzfiles, descriptor, \
                       format='extxyz', descriptors_path=None, descriptor_filename='dump_fp.*', der_filename='dump_der.*', \
                       start_file_id=1,image_num=None, skip=0, keep_lmpfiles=False, create_data = True, verbose=False, silent=False, **kwargs):
    
    """
    Read extxyz files as inputs and create descriptors and their derivatives w.r.t. atom coordinates.

    Args:
       elements: a list of elements, e.g. ['C','O','H'], make sure the sequence is consistant 
       xyzfiles: a serials of extxyz files of input atomic structures, wildcard * is used for files numerically ordered
       descritpor (dictionary): the parameter dictionary of the descriptor
       format: 'lammp-data','extxyz','vasp' etc. See complete list on https://wiki.fysik.dtu.dk/ase/ase/io/io.html#ase.io.read. 'extxyz' is recommanded.
       descriptors_path: a new directory where descirptors will be generated, default is './descriptors'
       descriptor_filename: default is 'dump_fp.*', numerically ordered
       der_filename: default is 'dump_der.*', numerically ordered
       start_file_id(int): starting id for descriptor and derivative files
       image_num: number of images that will be used, if it's None then read all files specified by xyzfile_name
       skip(int): skip some images
       keep_lmpfiles(bool): set to True if want to keep the lammps input and datafiles used for creating descriptors
       create_data(bool): set to True if want to create Data object using the generated descriptors
       verbose(bool): set to True if want to print out the extxyz file names used for creating descriptors
       kwargs: used to pass optional file styles
    """

    if descriptors_path is None:
        descriptors_path = str(hash(random.random()))

    os.makedirs(descriptors_path, exist_ok=True)
    

    if not isinstance(descriptor, dict):
        raise TypeError("descriptor shoud be given as a dictionary")
    
    if '*' in descriptor_filename:
        fp_name_mask = descriptor_filename.split('*')
        if len(fp_name_mask)!=2:
            raise ValueError('The descriptor_filename can only has one *.')  
    if '*' in der_filename:
        der_name_mask = der_filename.split('*')
        if len(der_name_mask)!=2:
            raise ValueError('The der_filename can only has one *.')

    xyzfile_path = os.path.dirname(os.path.abspath(xyzfiles))
    xyzfile_name = os.path.basename(os.path.abspath(xyzfiles))
        
    xyzfile_names = get_filenames(xyzfile_path,xyzfile_name)[skip:]

    if image_num is not None and image_num<len(xyzfile_names):
        nfiles = image_num
    else:
        nfiles = len(xyzfile_names)

    if nfiles >1 and '*' not in descriptor_filename:
        raise ValueError('Multiple extxyz files found, use * in descriptor_filename.')
    if nfiles >1 and '*' not in der_filename:
        raise ValueError('Multiple extxyz files found, use * in der_filename.')

    # check existing files in descriptor directory 
    if len(os.listdir(descriptors_path))>0:
        while True:
            del_files = input('There are existing files in %s, do you want to first delete the files, y/n? '%descriptors_path)
            if del_files=='y':
                for f in os.listdir(descriptors_path):
                    os.remove(os.path.join(descriptors_path,f))
                break
            elif del_files=='n':
                break
    
    if atomdnn.compute_force and silent==False:
        print('Start creating fingerprints and derivatives for %i files named \'%s\' ...'% (nfiles,descriptor_filename))
    elif silent==False:
        print('Start creating fingerprints for %i files named \'%s\' (no derivatives, set atomdnn.compute_force to True for derivatives) ...'% (nfiles,descriptor_filename))

    #os.chdir(descriptors_path) # switch to descriptor directory
    start_time = time.time()
    for i in range(nfiles):

        if format!='lammps-data':
            patom = read(xyzfile_names[i],format=format,**kwargs)
        
        create_lmp_input(descriptor,descriptors_path) # create lammps input file named 'in.gen_descriptors'

        if '*' in descriptor_filename:
            fp_fname = fp_name_mask[0] + str(i+start_file_id) + fp_name_mask[1]
        else:
            fp_fname = descriptor_filename
        if '*' in der_filename:
            der_fname = der_name_mask[0] + str(i+start_file_id) + der_name_mask[1]
        else:
            der_fname = der_filename

        lmpdatafile = 'lmpdatafile.'+str(i+start_file_id)
        lmpdatafile = os.path.join(descriptors_path,lmpdatafile)
        logfile = 'log.'+str(i+start_file_id)
        logfile = os.path.join(descriptors_path,logfile)


        if format=='lammps-data':
            shutil.copyfile(xyzfile_names[i],lmpdatafile)
        else:
            # use specorder to make sure the type of atoms are consistant
            write(lmpdatafile, patom, specorder=elements, format='lammps-data',atom_style='atomic') # create lammps datafile
        
        # lammps run command
        fp_pfname = os.path.join(descriptors_path,fp_fname)
        der_pfname = os.path.join(descriptors_path,der_fname)
        infile = os.path.join(descriptors_path,'in.gen_descriptors')
        lmp_cmd = os.environ['lmpexe']  + ' -in ' + infile \
                          + ' -var fp_filename ' + fp_pfname  \
                          + ' -var der_filename ' + der_pfname \
                          + ' -var lmpdatafile ' + lmpdatafile \
                          + ' -var logfile ' + logfile
      
        status = os.system(lmp_cmd) # run lammps
        if status!=0:
            raise RuntimeError('LAMMPS returns error, find error message in jupyter notebook terminal.' +
                               'To check problems, set keep_lmpfiles=True in create_descriptors function,' +
                               'and then check lammps input and data files in descriptor directory.')
        
        if not keep_lmpfiles:
            os.remove(lmpdatafile)
            os.remove(logfile)

        if verbose and silent==False:
            if atomdnn.compute_force:
                print('  file-%i: read atoms from \'%s\' and created descriptors in \'%s\' and derivatives in \'%s\'' \
                      % (i+1,os.path.basename(xyzfile_names[i]),fp_fname,der_fname))
            else:
                print('  file-%i: read atoms from \'%s\' and created descriptors in \'%s\'' \
                      % (i+1,os.path.basename(xyzfile_names[i]),fp_fname))
        if i > 0 and int((i+1)%10)==0 and silent==False:
            print ('  so far finished for %d images ...' % (i+1),flush=True)

    if not keep_lmpfiles:
        os.remove(infile)
        os.remove('log.lammps')

    if silent==False:
        print('It took %.2f seconds.'%(time.time()-start_time),flush=True)
        if atomdnn.compute_force:
            print('The fingerprints files \'%s\' and derivatives files \'%s\' are saved in folder \'%s\'.'%(descriptor_filename,der_filename,descriptors_path))
        else:
            print('The fingerprints files \'%s\' are saved in folder \'%s\'.'%(descriptor_filename,descriptors_path))
    if create_data is True:
        if silent is False:
            print('\nUsing the generated descriptors to create and return an AtomDNN Data object.')

        from atomdnn.data import Data
        # create a Data object using the generated descriptors        
        atomdnn_data = Data(descriptors_path,descriptor_filename, der_filename, xyzfile_path, xyzfile_name,format,image_num,skip,verbose,silent,**kwargs)

        return atomdnn_data
    


def get_num_fingerprints(descriptor,elements):
    """
    Compute the total number of fingerprints.
    
    Args:
        descritpor(dictionary): parameters that defines the descriptors
        elements(list): list of elements

    Returns:
        total number of fingerprints
    """

    ntypes = len(elements)

    if descriptor['name'] == 'acsf':
        ntypes_combinations = ntypes*(ntypes+1)/2;
        n_etaG2 = len(descriptor['etaG2'])
        n_etaG4 = len(descriptor['etaG4'])
        n_zeta = len(descriptor['zeta'])
        n_lambda = len(descriptor['lambda'])
        num_fingerprints = int(n_etaG2*ntypes + n_lambda*n_zeta*n_etaG4*ntypes_combinations + ntypes);

    return num_fingerprints