Back to home page

MITgcm

 
 

    


File indexing completed on 2023-05-28 05:10:56 UTC

view on githubraw file Latest commit b4daa243 on 2023-05-28 03:53:22 UTC
b4daa24319 Shre*0001 #include <stdio.h>
                0002 #include <stdlib.h>
                0003 #include "omp.h"
                0004 #include "adStack.h"
                0005 #include "adOMP.h"
                0006 
                0007 /*
                0008  Support functions for AD of OpenMP-parallel programs.  This is mostly to
                0009 ensure that the work is distributed among threads in the same way during the
                0010 forward and reverse sweep.  This is needed to ensure correct data flow, and to
                0011 enable the threads to use only their own threadprivate push/pop stack.
                0012 */
                0013 
                0014 /************ ensure OpenMP-enabled stack is used ***********/
                0015 
                0016 void assertSafeStack() {
                0017   if(!stackIsThreadSafe()) {
                0018     printf("ERROR: OpenMP-parallel derivative program uses non-OpenMP ADFirstAidKit.\n") ;
                0019     printf("Update to the latest ADFirstAidKit and compile it with openmp flags enabled.\n") ;
                0020     exit(1) ;
                0021   }
                0022 }
                0023 
                0024 /******************** static OpenMP schedule *****************/
                0025 /* This is the "easy" option: the static schedule distributes work among threads
                0026    in a deterministic way. If the same schedule is used in the fwd and rev code,
                0027    everything is fine. There is no overhead, no recording, no storage needed to
                0028    do this, but it only works for static schedules. */
                0029 
                0030 void getStaticSchedule(int lo, int hi, int stride, int* threadlo, int* threadhi) {
                0031 /* Static OpenMP scheduler, identical to what LLVM would use. Each thread gets
                0032    one chunk of consecutive iterations. The number of iterations per chunk is
                0033    aproximately trip_count/num_threads. If the trip count can not be evenly
                0034    divided among threads, the first few threads get one extra iteration.
                0035    As long as the number of threads stays constant, and when called by the
                0036    same thread, this subroutine will always return the same threadstart and
                0037    threadend when given the same imin,imax,istride as input. */
                0038 
                0039 /* formula to compute the number of iterations, F90 standard sec 8.1.4.4.1 */
                0040   int trip_count = (int)((hi-lo+stride)/stride) ;
                0041   if(trip_count < 0) trip_count = 0 ;
                0042 
                0043   int nth = omp_get_num_threads() ;
                0044   int tid = omp_get_thread_num() ;
                0045 
                0046   if(trip_count < nth) {
                0047     /* fewer iterations than threads. some threads will get one iteration,
                0048        the other threads will get nothing. */
                0049     if(tid < trip_count) {
                0050       /* do one iteration */
                0051       *threadlo = lo + tid * stride ;
                0052       *threadhi = *threadlo ;
                0053     }
                0054     else {
                0055       /* do nothing */
                0056       *threadhi = 0 ;
                0057       *threadlo = *threadhi + stride ;
                0058     }
                0059   }
                0060   /* at least one iteration per thread. since the total number of iterations may
                0061      not be evenly dividable by the number of threads, there will be a few extra
                0062      iterations. the first few threads will each get one of those, which results
                0063      in some offsetts that are applied to the start and end of the chunks. */
                0064   else {
                0065     int chunksize = trip_count / nth ;
                0066     int extras = trip_count % nth ;
                0067     int tidextras, incr ;
                0068     if(tid < extras) {
                0069       tidextras = tid ;
                0070       incr = 0 ;
                0071     }
                0072     else {
                0073       tidextras = extras ;
                0074       incr = stride ;
                0075     }
                0076     *threadlo = lo + (tid*chunksize + tidextras) * stride ;
                0077     *threadhi = *threadlo + chunksize * stride - incr ;
                0078   }
                0079   assertSafeStack() ;
                0080 }
                0081 
                0082 /****************** dynamic OpenMP schedule *****************/
                0083 /* This is the more challenging situation: The user wants a dynamic/auto/guided
                0084    schedule, which is not deterministic. We need to record how the workload was
                0085    distributed among threads, so that the reverse sweep can replay this.
                0086    Doing this requires a few integer operations per loop iteration, and a push
                0087    of two integers (chunk start and end) for each new chunk of work that is
                0088    given to a thread. If the memory consumption of this recording is a problem,
                0089    consider using larger chunks or a static schedule. */
/* Per-thread recording state for the dynamic schedule:
     numChunks      - number of contiguous chunks this thread has executed
                      since initDynamicSchedule()
     previousIter   - loop counter value seen on the most recent call to
                      recordDynamicSchedule()
     thisChunkStart - counter value at which the currently open chunk began
     isFirstIter    - nonzero until the first iteration has been recorded
   All four are threadprivate so each OpenMP thread records only its own
   chunks onto its own stack. */
static int numChunks, previousIter, thisChunkStart ;
static char isFirstIter ;
#pragma omp threadprivate(numChunks, previousIter, isFirstIter, thisChunkStart)
                0093 
                0094 void initDynamicSchedule() {
                0095   isFirstIter = 1 ;
                0096   numChunks = 0 ;
                0097   assertSafeStack() ;
                0098 }
                0099 
                0100 void recordDynamicSchedule(int counter, int stride) {
                0101   if(isFirstIter) {
                0102     thisChunkStart = counter ;
                0103     isFirstIter = 0 ;
                0104     numChunks ++ ;
                0105   }
                0106   else {
                0107     if(previousIter + stride != counter) {
                0108       pushInteger4(thisChunkStart) ;
                0109       pushInteger4(previousIter) ;
                0110       thisChunkStart = counter ;
                0111       numChunks ++ ;
                0112     }
                0113   }
                0114   previousIter = counter ;
                0115 }
                0116 
/* Called by each thread after a dynamically scheduled loop completes.
   Pushes the still-open final chunk, then the total chunk count. The
   push order is the on-tape protocol consumed by the reverse sweep:
   it pops numChunks first, then each chunk's (end, start) pair in
   reverse order — do not reorder these pushes. */
void finalizeDynamicSchedule() {
  pushInteger4(thisChunkStart) ;
  pushInteger4(previousIter) ;
  pushInteger4(numChunks) ;
}
                0122 
                0123 /****************** INTERFACE CALLED FROM FORTRAN *******************/
                0124 
/* Fortran-callable binding (trailing-underscore convention) that
   delegates to assertSafeStack(). */
void assertsafestack_() {
  assertSafeStack() ;
}
                0128 
/* Fortran-callable binding for getStaticSchedule(); Fortran passes all
   arguments by reference, so the scalar inputs are dereferenced here. */
void getstaticschedule_(int* lo, int* hi, int* stride, int* threadlo, int* threadhi) {
  getStaticSchedule(*lo, *hi, *stride, threadlo, threadhi) ;
}
                0132 
/* Fortran-callable binding that delegates to initDynamicSchedule(). */
void initdynamicschedule_() {
  initDynamicSchedule() ;
}
                0136 
/* Fortran-callable binding for recordDynamicSchedule(); dereferences the
   by-reference Fortran arguments. */
void recorddynamicschedule_(int* counter, int* stride) {
  recordDynamicSchedule(*counter, *stride) ;
}
                0140 
/* Fortran-callable binding that delegates to finalizeDynamicSchedule(). */
void finalizedynamicschedule_() {
  finalizeDynamicSchedule() ;
}