HPCToolkit
files.c
Go to the documentation of this file.
1 // -*-Mode: C++;-*- // technically C99
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL$
6 // $Id$
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright ((c)) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 // This file opens the three types of files that hpcrun uses: .log,
48 // .hpcrun and .hpctrace. The division of labor is that files.c knows
49 // about file names, opens the file and returns a file descriptor.
50 // Everything else just uses the fd.
51 //
52 // Major technical problems: (1) we name files by MPI rank, but we
53 // open the log and trace files before we know their rank. Thus, we
54 // have to open them early and rename them later. (2) we need a
55 // unique id for all files from the same process. (3) on some
56 // systems, gethostid does not return a unique value across all nodes.
57 // So, (hostid, pid) is not always unique to the process. Plus, a
58 // process can exec the same binary (same pid).
59 //
60 // The solution is to use rank, threadid, hostid, pid and a generation
61 // number to make unique names.
62 //
63 // progname-rank-thread-hostid-pid-gen.suffix
64 //
65 // Normally, (hostid, pid) uniquely identifies the process, but not
66 // always. We open() the file with O_EXCL. If that succeeds, then we
67 // win the race and (hostid, pid, gen) is the unique id for this
68 // process. If not, then bump the gen number and try again.
69 //
70 // If this continues to fail for several revs, then start picking
71 // random hostids. Possibly, a process could be exec-ing itself
72 // several times, but more likely either gethostid() or getpid() is
73 // not returning unique values on this system.
74 //
75 // Also, since we rename a file from rank 0 to its actual rank, we use
76 // both an early and late id.
77 //
78 // Note: we use the .log file as the lock for all file names for this
79 // process, for both early and late ids. That is, the process that
80 // opens the file "prog-0-0-host-pid-gen.log" owns all file names of
81 // the form "prog-0-*-host-pid-gen.*" (early id). Also, the process
82 // that opens "prog-rank-0-host-pid-gen.log" owns all file names of
83 // the form "prog-rank-*-host-pid-gen.*" (late id).
84 //
85 // Thus, it is necessary to open the .log file first and also rename
86 // the .log file as the first late name.
87 //
88 // Note: it is important to open() the files with O_EXCL to determine
89 // the winner of the lock. Also, when renaming a file, it's important
90 // to link() the new name and then unlink() the old name if successful
91 // instead of using rename(). link(2) returns EEXIST if the new name
92 // already exists, whereas rename(2) silently overwrites an existing
93 // file.
94 //
95 // Note: it's ok to rename a file with an open file descriptor.
96 //
97 // It would make sense to replace the (hostid, pid, gen) ids with a
98 // single random number of some length, again testing with O_EXCL and
99 // using a different value if necessary.
100 
101 
102 //***************************************************************
103 // global includes
104 //***************************************************************
105 
106 #include <errno.h> // errno
107 #include <fcntl.h> // open
108 #include <limits.h> // PATH_MAX
109 #include <stdio.h> // sprintf
110 #include <stdlib.h> // realpath
111 #include <string.h> // strerror
112 #include <unistd.h> // gethostid
113 #include <sys/time.h> // gettimeofday
114 #include <sys/types.h> // struct stat
115 #include <sys/stat.h> // stat
116 #include <stdbool.h>
117 
118 
119 //***************************************************************
120 // local includes
121 //***************************************************************
122 
123 #include "env.h"
124 #include "disabled.h"
125 #include "files.h"
126 #include "messages.h"
127 #include "thread_data.h"
128 #include "loadmap.h"
129 #include "sample_prob.h"
130 
131 #include <lib/prof-lean/spinlock.h>
132 #include <lib/support-lean/OSUtil.h>
133 
134 
135 //***************************************************************
136 // macros
137 //***************************************************************
138 
139 // directory/progname-rank-thread-hostid-pid-gen.suffix
140 #define FILENAME_TEMPLATE "%s/%s-%06u-%03d-" HOSTID_FORMAT "-%u-%d.%s"
141 
142 #define FILES_RANDOM_GEN 4
143 #define FILES_MAX_GEN 11
144 
145 #define FILES_EARLY 0x1
146 #define FILES_LATE 0x2
147 
// Identifier that keeps the file names of one process unique.  One
// instance is kept for the early names and one for the late names.
struct fileid {
  int done;   // set to 1 once an open or rename attempt with this id has finished
  long host;  // host id part of the file name; may be replaced by a random value
  int gen;    // generation number, bumped each time a name is already taken
};
153 
154 
155 //***************************************************************
156 // forward declarations
157 //***************************************************************
158 
159 static void hpcrun_rename_log_file_early(int rank);
160 
161 
162 //***************************************************************
163 // local data
164 //***************************************************************
165 
166 static char default_path[PATH_MAX] = {'\0'};
167 static char output_directory[PATH_MAX] = {'\0'};
168 static char executable_name[PATH_MAX] = {'\0'};
169 static char executable_pathname[PATH_MAX] = {'\0'};
170 
171 // These variables are protected by the files lock.
172 // Opening or renaming a file must acquire the lock.
173 
175 static pid_t mypid = 0;
176 static struct fileid earlyid;
177 static struct fileid lateid;
178 static int log_done = 0;
179 static int log_rename_done = 0;
180 static int log_rename_ret = 0;
181 
182 
183 //***************************************************************
184 // private operations
185 //***************************************************************
186 
187 // In general, the external functions acquire the files lock and the
188 // internal functions require that the lock is already held.
189 
190 // Reset the file ids on first use (pid 0) or after fork.
191 static void
193 {
194  pid_t cur_pid = getpid();
195 
196  if (mypid != cur_pid) {
197  mypid = cur_pid;
198  earlyid.done = 0;
200  earlyid.gen = 0;
201  lateid = earlyid;
202  log_done = 0;
203  log_rename_done = 0;
204  log_rename_ret = 0;
205  }
206 }
207 
208 
209 // Replace "id" with the next unique id if possible. Normally,
210 // (hostid, pid, gen) works after one or two iterations. To be extra
211 // robust (eg, hostid is not unique), at some point, give up and pick
212 // a random hostid.
213 //
214 // Returns: 0 on success, else -1 on failure.
215 //
216 static int
218 {
219  struct timeval tv;
220  int fd;
221 
222  if (id->done || id->gen >= FILES_MAX_GEN) {
223  // failure, out of options
224  return -1;
225  }
226 
227  id->gen++;
228  if (id->gen >= FILES_RANDOM_GEN) {
229  // give up and use a random host id
230  fd = open("/dev/urandom", O_RDONLY);
231  if (fd >= 0) {
232  read(fd, &id->host, sizeof(id->host));
233  close(fd);
234  }
235  gettimeofday(&tv, NULL);
236  id->host += (tv.tv_sec << 20) + tv.tv_usec;
237  id->host &= 0x00ffffffff;
238  }
239 
240  return 0;
241 }
242 
243 
244 // Open the file with O_EXCL and try the next file id if it already
245 // exists. The log and trace files are opened early, the profile file
246 // (hpcrun) is opened late. Must hold the files lock.
247 
248 // Returns: file descriptor, else die on failure.
249 //
250 static int
251 hpcrun_open_file(int rank, int thread, const char *suffix, int flags)
252 {
253  char name[PATH_MAX];
254  struct fileid *id;
255  int fd, ret;
256 
257  // If not recording data for this process, then open /dev/null.
258  if (! hpcrun_sample_prob_active()) {
259  fd = open("/dev/null", O_WRONLY);
260  return fd;
261  }
262 
263  id = (flags & FILES_EARLY) ? &earlyid : &lateid;
264  for (;;) {
265  errno = 0;
266  ret = snprintf(name, PATH_MAX, FILENAME_TEMPLATE, output_directory,
267  executable_name, rank, thread, id->host, mypid, id->gen, suffix);
268  if (ret >= PATH_MAX) {
269  fd = -1;
270  errno = ENAMETOOLONG;
271  break;
272  }
273  fd = open(name, O_WRONLY | O_CREAT | O_EXCL, 0644);
274  if (fd >= 0) {
275  // success
276  break;
277  }
278  if (errno != EEXIST || hpcrun_files_next_id(id) != 0) {
279  // failure, out of options
280  fd = -1;
281  break;
282  }
283  }
284  id->done = 1;
285  if (flags & FILES_EARLY) {
286  // late id starts where early id is chosen
287  lateid = earlyid;
288  lateid.done = 0;
289  }
290 
291  // Failure to open is a fatal error.
292  if (fd < 0) {
293  hpcrun_abort("hpctoolkit: unable to open %s file: '%s': %s",
294  suffix, name, strerror(errno));
295  }
296 
297  return fd;
298 }
299 
300 
301 // Rename the file from MPI rank 0 and early id to new rank and late
302 // id (rename is always late). Must hold the files lock.
303 //
304 // Note: must use link(2) and unlink(2) instead of rename(2) to
305 // atomically test if the new file exists. rename() silently
306 // overwrites a previous file.
307 //
308 // Returns: 0 on success, else -1 on failure.
309 static int
310 hpcrun_rename_file(int rank, int thread, const char *suffix)
311 {
312  char old_name[PATH_MAX], new_name[PATH_MAX];
313  int ret;
314 
315  // Not recoding data for this process.
316  if (! hpcrun_sample_prob_active()) {
317  return 0;
318  }
319 
320  // Old and new names are the same.
321  if (rank == 0 && earlyid.host == lateid.host && earlyid.gen == lateid.gen) {
322  return 0;
323  }
324 
325  snprintf(old_name, PATH_MAX, FILENAME_TEMPLATE, output_directory,
326  executable_name, 0, thread, earlyid.host, mypid, earlyid.gen, suffix);
327  for (;;) {
328  errno = 0;
329  ret = snprintf(new_name, PATH_MAX, FILENAME_TEMPLATE, output_directory,
330  executable_name, rank, thread, lateid.host, mypid, lateid.gen, suffix);
331  if (ret >= PATH_MAX) {
332  ret = -1;
333  errno = ENAMETOOLONG;
334  break;
335  }
336  ret = link(old_name, new_name);
337  if (ret == 0) {
338  // success
339  unlink(old_name);
340  break;
341  }
342  if (errno != EEXIST || hpcrun_files_next_id(&lateid) != 0) {
343  // failure, out of options
344  ret = -1;
345  break;
346  }
347  }
348  lateid.done = 1;
349 
350  // Failure to rename is a loud warning.
351  if (ret < 0) {
352  EEMSG("hpctoolkit: unable to rename %s file: '%s' -> '%s': %s",
353  suffix, old_name, new_name, strerror(errno));
354  EMSG("hpctoolkit: unable to rename %s file: '%s' -> '%s': %s",
355  suffix, old_name, new_name, strerror(errno));
356  }
357 
358  return ret;
359 }
360 
361 
362 //***************************************************************
363 // interface operations
364 //***************************************************************
365 
366 const char*
368 {
369  const char* load_name = hpcrun_loadmap_findLoadName(executable_name);
370 
371  return load_name ? load_name : executable_name;
372 }
373 
374 
375 const char *
377 {
378  return executable_name;
379 }
380 
381 
382 void
384 {
385  char *path = getenv(HPCRUN_OUT_PATH);
386 
387  // compute path for default measurement directory
388  if (path == NULL || strlen(path) == 0) {
389  const char *jid = OSUtil_jobid();
390  if (jid == NULL) {
391  sprintf(default_path, "./hpctoolkit-%s-measurements", executable_name);
392  } else {
393  sprintf(default_path, "./hpctoolkit-%s-measurements-%s", executable_name, jid);
394  }
395  path = default_path;
396  // N.B.: safe to skip checking for errors as realpath will notice them
397  }
398 
399  int ret = mkdir(path, 0755);
400  if (ret != 0 && errno != EEXIST) {
401  hpcrun_abort("hpcrun: could not create output directory `%s': %s",
402  path, strerror(errno));
403  }
404 
405  char* rpath = realpath(path, output_directory);
406  if (!rpath) {
407  hpcrun_abort("hpcrun: could not access directory `%s': %s", path, strerror(errno));
408  }
409 }
410 
411 
412 const char *
414 {
415  return output_directory;
416 }
417 
418 
419 void
421 {
422  strncpy(executable_name, basename(execname), sizeof(executable_name));
423 
424  if ( ! realpath(execname, executable_pathname) ) {
425  strncpy(executable_pathname, execname, sizeof(executable_pathname));
426  }
427 }
428 
429 
430 // Returns: file descriptor for log file.
431 int
433 {
434  int ret;
435 
436  spinlock_lock(&files_lock);
439  if (ret >= 0) {
440  log_done = 1;
441  }
442  spinlock_unlock(&files_lock);
443 
444  return ret;
445 }
446 
447 
448 // Returns: file descriptor for trace file.
449 int
451 {
452  int ret;
453 
454  TMSG(TRACE, "Opening trace file for %d", thread);
455  spinlock_lock(&files_lock);
456  TMSG(TRACE, "Calling files init for %d", thread);
458  TMSG(TRACE, "About to open file for %d", thread);
460  TMSG(TRACE, "Back from open file %d, ret code = %d", thread, ret);
461  spinlock_unlock(&files_lock);
462  TMSG(TRACE, "Unlocked file lock for %d", thread);
463 
464  return ret;
465 }
466 
467 // Returns: file descriptor for profile (hpcrun) file.
468 int
469 hpcrun_open_profile_file(int rank, int thread)
470 {
471  int ret;
472 
473  spinlock_lock(&files_lock);
476  ret = hpcrun_open_file(rank, thread, HPCRUN_ProfileFnmSfx, FILES_LATE);
477  spinlock_unlock(&files_lock);
478 
479  return ret;
480 }
481 
482 
483 // Note: we use the log file as the lock for the file names, so we
484 // need to rename the log file as the first late action. Since this
485 // is out of sequence, we save the return value and return it when the
486 // log rename is actually called. Must hold the files lock.
487 //
488 static void
490 {
491  if (log_done && !log_rename_done) {
493  log_rename_done = 1;
494  }
495 }
496 
497 
498 // Returns: 0 on success, else -1 on failure.
499 int
501 {
502  spinlock_lock(&files_lock);
504  spinlock_unlock(&files_lock);
505 
506  return log_rename_ret;
507 }
508 
509 
510 // Returns: 0 on success, else -1 on failure.
511 int
512 hpcrun_rename_trace_file(int rank, int thread)
513 {
514  int ret;
515 
516  TMSG(TRACE, "Renaming trace file for rank %d, thread %d", rank, thread);
517  spinlock_lock(&files_lock);
518  TMSG(TRACE, "(Rename) Spin lock acquired for (R:%d, T:%d)", rank, thread);
520  TMSG(TRACE, "Rename log file early (R:%d, T:%d)", rank, thread);
521  ret = hpcrun_rename_file(rank, thread, HPCRUN_TraceFnmSfx);
522  TMSG(TRACE, "Back from rename trace file for(R:%d, T:%d), retcode = %d", rank, thread, ret);
523  spinlock_unlock(&files_lock);
524  TMSG(TRACE, "(rename) Spin lock released for (R:%d, T:%d)", rank, thread);
525 
526  return ret;
527 }
static struct fileid earlyid
Definition: files.c:176
void hpcrun_files_set_executable(char *execname)
Definition: files.c:420
static int log_rename_done
Definition: files.c:179
static void hpcrun_files_init(void)
Definition: files.c:192
const char * hpcrun_files_output_directory()
Definition: files.c:413
void hpcrun_files_set_directory()
Definition: files.c:383
const char * hpcrun_loadmap_findLoadName(const char *name)
Definition: loadmap.c:315
static void spinlock_unlock(spinlock_t *l)
Definition: spinlock.h:96
static spinlock_t files_lock
Definition: files.c:174
static const char HPCRUN_TraceFnmSfx[]
Definition: hpcrun-fmt.h:96
const char * hpcrun_files_executable_name()
Definition: files.c:376
int gen
Definition: files.c:151
#define hpcrun_abort(...)
Definition: messages.h:102
#define FILENAME_TEMPLATE
Definition: files.c:140
static const char HPCRUN_ProfileFnmSfx[]
Definition: hpcrun-fmt.h:93
long OSUtil_hostid()
Definition: OSUtil.c:162
#define FILES_MAX_GEN
Definition: files.c:143
int hpcrun_sample_prob_active(void)
Definition: sample_prob.c:193
Definition: files.c:148
int hpcrun_open_log_file(void)
Definition: files.c:432
static char default_path[PATH_MAX]
Definition: files.c:166
#define EMSG
Definition: messages.h:70
#define FILES_RANDOM_GEN
Definition: files.c:142
int hpcrun_open_profile_file(int rank, int thread)
Definition: files.c:469
static char executable_name[PATH_MAX]
Definition: files.c:168
int hpcrun_open_trace_file(int thread)
Definition: files.c:450
static struct fileid lateid
Definition: files.c:177
static const char HPCRUN_LogFnmSfx[]
Definition: hpcrun-fmt.h:99
const char * HPCRUN_OUT_PATH
Definition: env.c:50
static pid_t mypid
Definition: files.c:175
static void spinlock_lock(spinlock_t *l)
Definition: spinlock.h:111
static int hpcrun_rename_file(int rank, int thread, const char *suffix)
Definition: files.c:310
const char * hpcrun_files_executable_pathname()
Definition: files.c:367
static char executable_pathname[PATH_MAX]
Definition: files.c:169
static char execname[PATH_MAX]
Definition: main.c:214
static char output_directory[PATH_MAX]
Definition: files.c:167
#define TMSG(f,...)
Definition: messages.h:93
static int log_done
Definition: files.c:178
int mkdir(const char *dir)
Definition: FileUtil.cpp:289
#define FILES_LATE
Definition: files.c:146
ssize_t MONITOR_EXT_WRAP_NAME() read(int fd, void *buf, size_t count)
Definition: io-over.c:152
int hpcrun_rename_log_file(int rank)
Definition: files.c:500
const char * OSUtil_jobid()
Definition: OSUtil.c:100
#define EEMSG(...)
Definition: messages.h:90
#define NULL
Definition: ElfHelper.cpp:85
static int log_rename_ret
Definition: files.c:180
long host
Definition: files.c:150
string basename(const char *fName)
Definition: FileUtil.cpp:90
static int hpcrun_open_file(int rank, int thread, const char *suffix, int flags)
Definition: files.c:251
int done
Definition: files.c:149
int hpcrun_rename_trace_file(int rank, int thread)
Definition: files.c:512
static int hpcrun_files_next_id(struct fileid *id)
Definition: files.c:217
static void hpcrun_rename_log_file_early(int rank)
Definition: files.c:489
#define SPINLOCK_UNLOCKED
Definition: spinlock.h:84
#define FILES_EARLY
Definition: files.c:145