8:00:15 AM PDT - Fri, May 2nd 2014 |
|
Hi;
I've built NWChem with OpenMPI 1.6.5 support according to the docs. I run using mpirun. The cluster has 16-core nodes. However, once PBS schedules the job, all the processes (try to) run on the first node, rather than 16 per node. So even though I have, say, two nodes assigned by the scheduler, all 32 NWChem instances run on the first node. Or they try to; the job runs out of memory. AFAIK I compiled NWChem properly, and have installed it in a network-accessible location.
What, likely simple, thing have I overlooked? Thanks,
Steve
Excerpt from PBS submit file:
NPROCS=`wc -l < $PBS_NODEFILE`
module load nwchem
module load openmpi
mpirun --hostfile $PBS_NODEFILE -np $NPROCS nwchem input.dat > output.dat
The script used to build NWChem is:
#!/bin/csh
setenv NWCHEM_MODULES all
setenv NWCHEM_TOP /home/admin/root/src/nwchem-6.3.revision2-src.2013-10-17
setenv NWCHEM_TARGET LINUX64
setenv LARGE_FILES TRUE
setenv LIB_DEFINES -DDFLT_TOT_MEM=134217728
setenv USE_NOFSCHECK TRUE
setenv TCGRSH /usr/bin/ssh
setenv FC ifort
setenv CC icc
setenv USE_MPI y
setenv USE_MPIF y
setenv USE_MPIF4 y
# setenv LIBMPI "-L/usr/lib64 -lmca_common_sm -lmpi_f77 -lmpi -lopen-pal -lopen-trace-format -lvt-hyb -lvt-mpi-unify -lvt -lmpi_cxx -lmpi_f90 -lompitrace -lopen-rte -lotfaux -lvt-mpi -lvt-mt"
setenv LIBMPI "-L/usr/lib64 -lmca_common_sm -lmpi_f77 -lmpi -lmpi_cxx -lmpi_f90 -lompitrace -lopen-rte -lotfaux -ldl -Wl,--export-dynamic -lnsl -lutil"
setenv MPI_BASEDIR /usr/local/openmpi/openmpi-1.6.5/intel-14.0.1
setenv MPI_INCLUDE $MPI_BASEDIR/include
setenv MPI_LIB $MPI_BASEDIR/lib
setenv IB_HOME "/usr"
setenv IB_INCLUDE "$IB_HOME/include"
setenv IB_LIB "$IB_HOME/lib64"
setenv IB_LIB_NAME "-libumad -lpthread"
setenv ARMCI_NETWORK OPENIB
module load intel/14.0.1
module load openmpi/1.6.5/intel/14.0.1
# setenv BLASOPT "-L/zhome/Apps/intel/composerxe/mkl/lib/intel64/ -lmkl_blas95_ilp64 -lmkl_blas95_lp64 -lmkl_lapack95_lp64 -lbmkl_lapack95_ilp64"
echo "build time, here we go."
printenv
cd $NWCHEM_TOP/src
# make realclean;
make >& make.log2
[root@hbar1 src]#
|