[Oar-commits] OAR branch 2.5 updated. 2.5.3+rc4-9-g69d24fa

Nicolas Capit capitn at ff-scm-v4-prod.irisa.fr
Mon Apr 8 23:09:23 CEST 2013


The branch 2.5 has been updated
       via  69d24faab70712713cb60007f30bdc64eca7d92c (commit)
      from  a7d3d8aea4a1e9280fcf69ed5e2ef1113d6546b0 (commit)


- Log -----------------------------------------------------------------
commit 69d24fa
Author: capitn <nicolas.capit at imag.fr>
Date:   Mon Apr 8 23:05:38 2013 +0200

    [oarexec] Add comments in stderr job files
    
    Add comments in user job STDERR files so that users can tell whether a job
    was killed, signaled or checkpointed. For example, the user can see:
      ## OAR [2013-04-08 23:00:22] Job 3663 KILLED
      ## OAR [2013-04-08 23:03:16] Job 3664 SIGNALED with USR2
      ## OAR [2013-04-08 23:07:17] Job 3666 CHECKPOINTED with signal 12
---
 CHANGELOG                                 |    2 +
 sources/core/common-libs/lib/OAR/Tools.pm |    3 +
 sources/core/modules/runner/oarexec       |   60 ++++++++++++++++++-----------
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8db0008..0bbef9b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -30,6 +30,8 @@ version 2.5.3:
     the job attributes: queue, project, types, user
     (available with the scheduler
     "oar_sched_gantt_with_timesharing_and_fairsharing_and_quotas")
+  - Add comments in user job STDERR files to know if a job was killed or
+    checkpointed
 
 version 2.5.2:
 --------------
diff --git a/sources/core/common-libs/lib/OAR/Tools.pm b/sources/core/common-libs/lib/OAR/Tools.pm
index ceef6ea..13aa680 100644
--- a/sources/core/common-libs/lib/OAR/Tools.pm
+++ b/sources/core/common-libs/lib/OAR/Tools.pm
@@ -555,6 +555,9 @@ if($pid == 0){
     warn("[OAR] Cannot find @{$cmd_exec}\n");
     exit(-1);
 }
+select(OLDSTDOUT);
+$| = 1;
+print(OLDSTDOUT "USER_CMD_PID $pid\n");
 waitpid($pid,0);
 
 print(OLDSTDOUT "EXIT_CODE $?");
diff --git a/sources/core/modules/runner/oarexec b/sources/core/modules/runner/oarexec
index 5cc5fe4..bdf0dcc 100755
--- a/sources/core/modules/runner/oarexec
+++ b/sources/core/modules/runner/oarexec
@@ -34,7 +34,7 @@ BEGIN{
 
 use strict;
 use Sys::Hostname;
-use POSIX qw(:signal_h :errno_h :sys_wait_h);
+use POSIX qw(:signal_h :errno_h :sys_wait_h strftime);
 
 my $Old_umask = sprintf("%lo",umask());
 umask(oct("022"));
@@ -176,12 +176,14 @@ sub kill_children($){
     system({"oardodo"} "oardodo","kill","-9",@{$children});
 }
 
-sub send_kill_signal_to_myself(){
+sub send_kill_signal_to_myself($){
+    my $signal_to_send = shift;
+
     my $father = $$;
     my $pid=fork;
     if($pid==0){
         sleep(5);
-        kill('SIGUSR2', $father);
+        kill($signal_to_send, $father);
         exit();
     }
 }
@@ -360,7 +362,7 @@ sub user_defined_signal_handler {
     my $signal;
     open(FILE, OAR::Tools::get_oar_user_signal_file_name($Job_id));
     while (<FILE>) {
-	$signal = $_;
+        $signal = $_;
     }
     close(FILE);
     print(pipe_kill_write "SIGNAL_$signal\n");
@@ -499,6 +501,7 @@ my $Stop_signal = 0;
 my $Checkpoint_signal = 0;
 my $user_signal = 0;
 my $stop_loop = 0;
+my $user_cmd_pid = -1;
 # wait end of the child process or KILL notification
 while (($line_read != $pid) and ($Stop_signal == 0) and ($stop_loop == 0)){
     print("[oarexec $Job_id] wait end of child process or kill notification\n");
@@ -515,6 +518,9 @@ while (($line_read != $pid) and ($Stop_signal == 0) and ($stop_loop == 0)){
             if ($line_read =~ /^EXIT_CODE\s(\d+)$/){
                 $Exit_script_code = $1;
                 print("OAREXEC_SCRIPT_EXIT_VALUE $Exit_script_code\n");
+            }elsif ($line_read =~ /^USER_CMD_PID\s(\d+)$/){
+                $user_cmd_pid = $1;
+                print("[oarexec $Job_id] User command PID = $user_cmd_pid\n");
             }
         }
     }elsif (vec($rin_tmp, fileno(pipe_child_read), 1)){
@@ -536,6 +542,10 @@ while (($line_read != $pid) and ($Stop_signal == 0) and ($stop_loop == 0)){
                 $stop_loop = 1;
             }else{
                 kill_children($pid);
+                if (defined($Job->{stderr_file}) and ($Job->{stderr_file} ne "")){
+                    my $date = strftime("%F %T",localtime());
+                    system({"bash"} "bash","-c","cd $Job->{launching_directory} && echo '## OAR [$date] Job $Job_id KILLED' | OARDO_BECOME_USER=$Job->{job_user} oardodo tee -a '$Job->{stderr_file}'");
+                }
             }
             $kill_myself = 1;
         }elsif ($line_read eq "STOP"){
@@ -546,31 +556,35 @@ while (($line_read != $pid) and ($Stop_signal == 0) and ($stop_loop == 0)){
                 print("[oarexec $Job_id] Receive USR1 signal so someone wants to finish this job but it is not an INTERACTIVE one\n");
             }
         }elsif ($line_read eq "CHECKPOINT"){
-            #We must send SIGUSR2 to the child of $pid
-            my ($tmp_hash, $tmp_cmd_hash) = OAR::Tools::get_all_process_children();
-            my $pid_to_send_kill = @{$tmp_hash->{$pid}}[0];
-            if (defined($pid_to_send_kill)){
-                print("[oarexec $Job_id] Checkpoint received, send signal $Job->{checkpoint_signal} to the pid $pid_to_send_kill\n");
-                system({"oardodo"} "oardodo","kill","-s",$Job->{checkpoint_signal},$pid_to_send_kill);
+            #We must send the signal defined by oarsub to the child of $pid
+            if ($user_cmd_pid >= 0){
+                print("[oarexec $Job_id] Checkpoint received, send signal $Job->{checkpoint_signal} to the pid $user_cmd_pid\n");
+                system({"oardodo"} "oardodo","kill","-s",$Job->{checkpoint_signal},$user_cmd_pid);
+                if (defined($Job->{stderr_file}) and ($Job->{stderr_file} ne "")){
+                    my $date = strftime("%F %T",localtime());
+                    system({"bash"} "bash","-c","cd $Job->{launching_directory} && echo '## OAR [$date] Job $Job_id CHECKPOINTED with signal $Job->{checkpoint_signal}' | OARDO_BECOME_USER=$Job->{job_user} oardodo tee -a '$Job->{stderr_file}'");
+                }
             }else{
-                print("[oarexec $Job_id] Cannot find pid of user process??? I will retry in 5 seconds\n");
-                send_kill_signal_to_myself();
+                print("[oarexec $Job_id] Cannot find pid of user process. I will retry in 5 seconds\n");
+                send_kill_signal_to_myself('SIGUSR2');
             }
-            $Checkpoint_signal = 1;	  
+            $Checkpoint_signal = 1;
         }elsif ($line_read =~ m/^SIGNAL_(.*)/){
-	    my $signal = $1;
-	    #We must send $signal to the child of $pid
-            my ($tmp_hash, $tmp_cmd_hash) = OAR::Tools::get_all_process_children();
-            my $pid_to_send_kill = @{$tmp_hash->{$pid}}[0];
-            if (defined($pid_to_send_kill)){
-                print("[oarexec $Job_id] Signal received, send signal $signal to the pid $pid_to_send_kill\n");
-                system({"oardodo"} "oardodo","kill","-s",$signal,$pid_to_send_kill);
+            my $signal = $1;
+            #We must send $signal to the child of $pid
+            if ($user_cmd_pid >= 0){
+                print("[oarexec $Job_id] Signal URG received, send signal $signal to the pid $user_cmd_pid\n");
+                system({"oardodo"} "oardodo","kill","-s",$signal,$user_cmd_pid);
+                if (defined($Job->{stderr_file}) and ($Job->{stderr_file} ne "")){
+                    my $date = strftime("%F %T",localtime());
+                    system({"bash"} "bash","-c","cd $Job->{launching_directory} && echo '## OAR [$date] Job $Job_id SIGNALED with $signal' | OARDO_BECOME_USER=$Job->{job_user} oardodo tee -a '$Job->{stderr_file}'");
+                }
             }else{
-                print("[oarexec $Job_id] Cannot find pid of user process??? I will retry in 5 seconds\n");
-                send_kill_signal_to_myself();
+                print("[oarexec $Job_id] Cannot find pid of user process. I will retry in 5 seconds\n");
+                send_kill_signal_to_myself('URG');
             }
             $user_signal = 1;
-	}
+        }
     }    
 }
 
-----------------------------------------------------------------------

Summary of changes:
 CHANGELOG                                 |    2 +
 sources/core/common-libs/lib/OAR/Tools.pm |    3 +
 sources/core/modules/runner/oarexec       |   60 ++++++++++++++++++-----------
 3 files changed, 42 insertions(+), 23 deletions(-)


hooks/post-receive
-- 
OAR



More information about the Oar-commits mailing list