next up previous index
Next: Deadlocks Up: Message Passing Interface Previous: Exercise 2

Parallel Debugging

gustav@sp20:../MPI 10:28:03 !558 $ cat bad_life.c
#include <stdio.h>
#include <mpi.h>

void main(int argc, char *argv[])  /* Even ranks send one char to rank+1; odd ranks receive one char from rank-1. */
{
  int taskid;       /* this task's rank in MPI_COMM_WORLD */
  MPI_Status stat;  /* status record filled in by MPI_Recv */

  /* Initialize MPI and find out this task's rank. */
  MPI_Init (&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
  
  if ( (taskid % 2) == 0 ) {
    char *send_message = NULL;

    send_message = (char *) malloc(1);  /* BUG: only 1 byte is allocated ... */
    strcpy(send_message, "Forty Two");  /* BUG: ... but 10 bytes (9 chars + NUL) are copied into it */
    MPI_Send(send_message, 1, MPI_CHAR, taskid+1, 0, MPI_COMM_WORLD);  /* count is 1: only the first char is sent */
    free(send_message);
  } else {
    char *recv_message = NULL;

    MPI_Recv(recv_message, 1, MPI_CHAR, taskid-1, 0, MPI_COMM_WORLD, &stat);  /* BUG: recv_message is still NULL; no receive buffer was ever allocated */
    printf("The answer is %s\n", recv_message);  /* BUG: %s expects a NUL-terminated string, but only 1 char was requested */
    free(recv_message);  /* recv_message is never assigned anything but NULL */
  }

  printf("Task %d complete.\n", taskid);
  MPI_Finalize();
  exit(0);
}
gustav@sp20:../MPI 10:28:06 !559 $

Compile and link

gustav@sp20:../MPI 10:28:06 !559 $ mpcc -g -o bad_life bad_life.c
gustav@sp20:../MPI 10:29:11 !560 $

Run

gustav@sp20:../MPI 10:29:11 !560 $ poe bad_life -procs 8
Task 0 complete.
Task 2 complete.
Task 4 complete.
Task 6 complete.
ERROR: 0031-250  task 3: Segmentation fault
ERROR: 0031-250  task 7: Segmentation fault
ERROR: 0031-250  task 2: Terminated
ERROR: 0031-250  task 0: Terminated
ERROR: 0031-250  task 4: Terminated
ERROR: 0031-250  task 6: Terminated
ERROR: 0031-250  task 1: Terminated
ERROR: 0031-250  task 5: Terminated
gustav@sp20:../MPI 10:30:05 !561 $

Debug the core file with dbx

gustav@sp20:../MPI 10:30:05 !561 $ ls -FCsd core*
   0 coredir.3/     0 coredir.7/
gustav@sp20:../MPI 10:30:55 !562 $ ls -FCs coredir.3
total 5
   5 core
gustav@sp20:../MPI 10:31:20 !563 $ dbx bad_life coredir.3/core
Type 'help' for help.
reading symbolic information ...
[using memory image in coredir.3/core]

Segmentation fault in moveeq.memcpy [/usr/lpp/ppe.poe/lib/ip/libmpci.a] at 0xd0b49d9c
0xd0b49d9c (memcpy+0x11c) 7ca01d2a       stsx   r5,r0,r3
(dbx) where
moveeq.memcpy() at 0xd0b49d9c
cpfromdev() at 0xd0b48b40
readdatafrompipe() at 0xd0b46aac
readfrompipe() at 0xd0b4d254
kickpipes() at 0xd0b4833c
mpci_recv() at 0xd0b54298
_mpi_recv(??, ??, ??, ??, ??, ??, ??) at 0xd0a15fc8
MPI__Recv(??, ??, ??, ??, ??, ??, ??) at 0xd0a14888
main(argc = 1, argv = 0x2ff228f0), line 23 in "bad_life.c"
(dbx) func main
(dbx) list 23
   23       MPI_Recv(recv_message, 1, MPI_CHAR, taskid-1, 0, MPI_COMM_WORLD, &stat);
(dbx) print recv_message
"recv_message" is not active
(dbx) print stat
(source = 0, tag = 0, error = 1, val1 = 0, val2 = 0, val3 = -559038737, val4 = -559038737, val5 = -559038737) 
(dbx) print taskid
3 
(dbx) quit
gustav@sp20:../MPI 10:33:54 !564 $

Run under the parallel debugger pdbx:

gustav@sp20:../MPI 10:38:59 !568 $ pdbx bad_life -procs 8
pdbx Version 2, Release 3 -- Oct 13 1998 21:45:00

  2:reading symbolic information ...
  0:reading symbolic information ...
  1:reading symbolic information ...
  3:reading symbolic information ...
  4:reading symbolic information ...
  5:reading symbolic information ...
  7:reading symbolic information ...
  6:reading symbolic information ...
  0:[1] stopped in main at line 10
  0:   10     MPI_Init (&argc, &argv);
  2:[1] stopped in main at line 10
  2:   10     MPI_Init (&argc, &argv);
  3:[1] stopped in main at line 10
  3:   10     MPI_Init (&argc, &argv);
  4:[1] stopped in main at line 10
  4:   10     MPI_Init (&argc, &argv);
  7:[1] stopped in main at line 10
  7:   10     MPI_Init (&argc, &argv);
  1:[1] stopped in main at line 10
  1:   10     MPI_Init (&argc, &argv);
  5:[1] stopped in main at line 10
  5:   10     MPI_Init (&argc, &argv);
  6:[1] stopped in main at line 10
  6:   10     MPI_Init (&argc, &argv);
0031-504  Partition loaded ...

pdbx(all) cont
  0:Task 0 complete.
  2:Task 2 complete.
  6:Task 6 complete.
  4:Task 4 complete.
  5:
  5:Segmentation fault in @moveeq._moveeq [/usr/lpp/ppe.poe/lib/ip/libmpci.a] at 0xd0b42d9c
  5:0xd0b42d9c (memmove+0x11c) 7ca01d2a       stsx   r5,r0,r3
  1:
  1:Segmentation fault in @moveeq._moveeq [/usr/lpp/ppe.poe/lib/ip/libmpci.a] at 0xd0b49d9c
  1:0xd0b49d9c (memmove+0x11c) 7ca01d2a       stsx   r5,r0,r3
  7:
  7:Segmentation fault in @moveeq._moveeq [/usr/lpp/ppe.poe/lib/ip/libmpci.a] at 0xd06f4d9c
  7:0xd06f4d9c (memmove+0x11c) 7ca01d2a       stsx   r5,r0,r3
  3:
  3:Segmentation fault in @moveeq._moveeq [/usr/lpp/ppe.poe/lib/ip/libmpci.a] at 0xd0b49d9c
  3:0xd0b49d9c (memmove+0x11c) 7ca01d2a       stsx   r5,r0,r3
^C
pdbx-subset(all) on 7

pdbx(7) where
  7:@moveeq.memmove() at 0xd06f4d9c
  7:cpfromdev() at 0xd06f3b40
  7:readdatafrompipe() at 0xd06f1aac
  7:readfrompipe() at 0xd06f8254
  7:kickpipes() at 0xd06f333c
  7:mpci_recv() at 0xd06ff298
  7:_mpi_recv(??, ??, ??, ??, ??, ??, ??) at 0xd0653fc8
  7:MPI__Recv(??, ??, ??, ??, ??, ??, ??) at 0xd0652888
  7:main(argc = 1, argv = 0x2ff228c8), line 23 in "bad_life.c"

pdbx(7) func main

pdbx(7) list 23
  7:   23       MPI_Recv(recv_message, 1, MPI_CHAR, taskid-1, 0, MPI_COMM_WORLD, &stat);

pdbx(7) print recv_message
  7:"recv_message" is not active

pdbx(7) quit
gustav@sp20:../MPI 10:41:43 !569 $



 

Zdzislaw Meglicki
2001-02-26