[Oar-commits] OAR branch 2.5 updated. 2.5.3+rc4-16-gd5c8c47

Nicolas Capit capitn at ff-scm-v4-prod.irisa.fr
Sat Apr 27 10:49:34 CEST 2013


The branch, 2.5 has been updated
       via  d5c8c479d7ecfb542b924c895c5b6d71909a5486 (commit)
      from  74757eb9d386149cf5535d8ab37c4dc0e08013b4 (commit)


- Log -----------------------------------------------------------------
commit d5c8c47
Author: capitn <nicolas.capit at imag.fr>
Date:   Sat Apr 27 10:47:24 2013 +0200

    [oardel] --force-terminate-finishing-job option
    
    Add the oardel option --force-terminate-finishing-job: to use when a job is
    stuck in the Finishing state
---
 CHANGELOG                                 |    2 +
 docs/documentation/FAQ-ADMIN              |    9 ++++++-
 sources/core/common-libs/lib/OAR/IO.pm    |   34 +++++++++++++++++++++++-
 sources/core/man/man1/oardel.pod          |    9 ++++++-
 sources/core/modules/node_change_state.pl |   32 +++++++++++------------
 sources/core/qfunctions/oardel            |   39 ++++++++++++++++++++++++++++-
 6 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 6932f1c..33fee50 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -39,6 +39,8 @@ version 2.5.3:
     
     So, even if all the ressources have "maintenance='on'", the new jobs will be
     accepted but not scheduled now.
+  - Add the oardel option --force-terminate-finishing-job: to use when a job is
+    stuck in the Finishing state
 
 version 2.5.2:
 --------------
diff --git a/docs/documentation/FAQ-ADMIN b/docs/documentation/FAQ-ADMIN
index 8100609..f0b7129 100644
--- a/docs/documentation/FAQ-ADMIN
+++ b/docs/documentation/FAQ-ADMIN
@@ -240,7 +240,14 @@ Notes:
 
 A job remains in the "Finishing" state, what can I do?
 ------------------------------------------------------
-If you have waited more than a couple of minutes (10mn for example) then
+From the 2.5.3 version, you can use the oardel command to force the change of
+the job state:
+::
+
+    oardel --force-terminate-finishing-job 42
+
+If you have an older version then you can do the same thing manually. If you
+have waited more than a couple of minutes (30mn for example) then
 something wrong occurred (frontal has crashed, out of memory, ...).
 
 So you are able to turn manually a job into the "Error" state by typing with the root user (example with a bash shell)::
diff --git a/sources/core/common-libs/lib/OAR/IO.pm b/sources/core/common-libs/lib/OAR/IO.pm
index 8acf571..81588be 100644
--- a/sources/core/common-libs/lib/OAR/IO.pm
+++ b/sources/core/common-libs/lib/OAR/IO.pm
@@ -858,14 +858,13 @@ sub get_job_host_log($$) {
     my $dbh = shift;
     my $moldablejobid = shift;
     
-    my $sth = $dbh->prepare("   SELECT resources.network_address, resources.resource_id
+    my $sth = $dbh->prepare("   SELECT DISTINCT(resources.network_address)
                                 FROM assigned_resources, resources
                                 WHERE
                                     assigned_resources.moldable_job_id = $moldablejobid AND
                                     resources.resource_id = assigned_resources.resource_id AND
                                     resources.network_address != \'\' AND
                                     resources.type = \'default\'
-                                ORDER BY resources.resource_id ASC
                             ");
     $sth->execute();
     my @res = ();
@@ -2521,6 +2520,37 @@ sub log_job($$){
     }
 }
 
+# Get the amount of time in the defined state for a job
+# args : base, job_id, job_state
+# returns a number of seconds
+sub get_job_duration_in_state($$$){
+    my $dbh = shift;
+    my $job_id = shift;
+    my $job_state = shift;
+
+    my $current_time = get_date($dbh);
+
+    my $sth = $dbh->prepare("   SELECT date_start, date_stop
+                                FROM job_state_logs
+                                WHERE
+                                    job_id = $job_id AND
+                                    job_state = \'$job_state\'
+                            ");
+    $sth->execute();
+    my $sum = 0;
+    while (my $ref = $sth->fetchrow_hashref()){
+        my $tmp_sum = 0;
+        if ($ref->{date_stop} == 0){
+            $tmp_sum = $current_time - $ref->{date_start};
+        }else{
+            $tmp_sum += $ref->{date_stop} - $ref->{date_start};
+        }
+        $sum += $tmp_sum if ($tmp_sum > 0);
+    }
+    $sth->finish();
+
+    return($sum);
+}
 
 # archive_some_moldable_job_nodes
 # sets the index fields to LOG in the table assigned_resources
diff --git a/sources/core/man/man1/oardel.pod b/sources/core/man/man1/oardel.pod
index 823788c..edf8351 100644
--- a/sources/core/man/man1/oardel.pod
+++ b/sources/core/man/man1/oardel.pod
@@ -4,7 +4,7 @@ oardel - delete or checkpoint job(s).
 
 =head1 SYNOPSIS
 
-B<oardel> [-c|-b][--array][job_ids][-h][-V]
+B<oardel> [-c|-b|-s|--force-terminate-finishing-job][--array][job_ids][-h][-V]
 
 =head1 DESCRIPTION
 
@@ -34,6 +34,13 @@ Delete/checkpoint array job(s) passed as parameter (all the sub-jobs of the give
 
 Delete/checkpoint jobs which respond to the SQL where clause on the table jobs (ex: "project = 'p1'").
 
+=item B<--force-terminate-finishing-job>
+
+Force to switch the jobs into the Error state when they are stuck in Finishing.
+
+Note: the nodes where the jobs were executing will be turned into Suspected because something wrong occurred if this option has to be used.
+
+
 =item B<-V, --version>
 
 Print OAR version number.
diff --git a/sources/core/modules/node_change_state.pl b/sources/core/modules/node_change_state.pl
index 489c295..7cf8008 100755
--- a/sources/core/modules/node_change_state.pl
+++ b/sources/core/modules/node_change_state.pl
@@ -55,7 +55,10 @@ foreach my $i (@events_to_check){
     ####################################################
     if ($i->{type} eq "SWITCH_INTO_TERMINATE_STATE"){
         OAR::IO::set_job_state($base,$i->{job_id},"Terminated");
-    }elsif ($i->{type} eq "SWITCH_INTO_ERROR_STATE"){
+    }elsif (
+            ($i->{type} eq "SWITCH_INTO_ERROR_STATE") ||
+            ($i->{type} eq "FORCE_TERMINATE_FINISHING_JOB")
+           ){
         OAR::IO::set_job_state($base,$i->{job_id},"Error");
     }
 
@@ -117,7 +120,8 @@ foreach my $i (@events_to_check){
         ($i->{type} eq "SSH_TRANSFER_TIMEOUT") ||
         ($i->{type} eq "BAD_HASHTABLE_DUMP") ||
         ($i->{type} eq "LAUNCHING_OAREXEC_TIMEOUT") ||
-        ($i->{type} eq "EXIT_VALUE_OAREXEC")
+        ($i->{type} eq "EXIT_VALUE_OAREXEC") ||
+        ($i->{type} eq "FORCE_TERMINATE_FINISHING_JOB")
        ){
         my @hosts;
         my $finaud_tag = "NO";
@@ -138,7 +142,8 @@ foreach my $i (@events_to_check){
                 ($i->{type} ne "PROLOGUE_ERROR") &&
                 ($i->{type} ne "EPILOGUE_ERROR") &&
                 ($i->{type} ne "CPUSET_ERROR") &&
-                ($i->{type} ne "CPUSET_CLEAN_ERROR")
+                ($i->{type} ne "CPUSET_CLEAN_ERROR") &&
+                ($i->{type} ne "FORCE_TERMINATE_FINISHING_JOB")
             ){
                 @hosts = ($hosts[0]);
             }else{
@@ -146,7 +151,6 @@ foreach my $i (@events_to_check){
                 # then the CPUSET clean will tell us which nodes are dead
                 my $cpuset_field = get_conf("JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD");
                 if (defined($cpuset_field) && ($i->{type} eq "EXTERMINATE_JOB")){
-                    #@hosts = ($hosts[0]);
                     @hosts = ();
                 }
             }
@@ -158,20 +162,14 @@ foreach my $i (@events_to_check){
             foreach my $j (@hosts){
                 next if ((defined($already_treated_host{$j}) or ($j eq "")));
                 $already_treated_host{$j} = 1;
-                #my @free_resources = OAR::IO::get_current_free_resources_of_node($base, $j);
-                #if ($#free_resources >= 0){
-                #    foreach my $r (@free_resources){
-                        #OAR::IO::set_resource_state($base,$r,"Suspected",$finaud_tag);
-                        OAR::IO::set_node_state($base,$j,"Suspected",$finaud_tag);
-                        foreach my $r (OAR::IO::get_all_resources_on_node($base,$j)){
-                          push(@resources_to_heal,"$r $j");
-                        }
-                        $Exit_code = 1;
-                #    }
-                #}
+                OAR::IO::set_node_state($base,$j,"Suspected",$finaud_tag);
+                foreach my $r (OAR::IO::get_all_resources_on_node($base,$j)){
+                    push(@resources_to_heal,"$r $j");
+                }
+                $Exit_code = 1;
             }
-            oar_warn("[NodeChangeState] error ($i->{type}) on the nodes:\n\n@hosts\n\nSo we are suspecting corresponding free resources\n");
-            send_log_by_email("Suspecting nodes","[NodeChangeState] error ($i->{type}) on the nodes:\n\n@hosts\n\nSo we are suspecting corresponding free resources\n");
+            oar_warn("[NodeChangeState] error ($i->{type}) on the nodes:\n\n@hosts\n\nSo we are suspecting them\n");
+            send_log_by_email("Suspecting nodes","[NodeChangeState] error ($i->{type}) on the nodes:\n\n@hosts\n\nSo we are suspecting them\n");
         }
     }
     
diff --git a/sources/core/qfunctions/oardel b/sources/core/qfunctions/oardel
index 9ec2500..b0db939 100755
--- a/sources/core/qfunctions/oardel
+++ b/sources/core/qfunctions/oardel
@@ -21,7 +21,7 @@ my $exitValue = 0;
 # Display command help
 sub usage {
     print <<EOS;
-Usage: $0 [-c|-b|-s][--array][job_ids][-h][-V]
+Usage: $0 [-c|-b|-s|--force-terminate-finishing-job][--array][job_ids][-h][-V]
 Delete or send checkpoint signal to jobs.
 Options:
   -h, --help              show this help screen
@@ -34,6 +34,9 @@ Options:
       --sql               delete/checkpoint jobs which respond to the SQL
                           where clause on the table jobs
                           (ex: "project = 'p1'")
+      --force-terminate-finishing-job
+                          force the job_ids in Finishing state to turn into
+                          Terminated (must be used at a last resort)
   -V, --version           print OAR version number
 EOS
     exit(1);
@@ -74,6 +77,7 @@ my $Version;
 my $Sql_property;
 my $Besteffort;
 my $array;
+my $Force_terminate_finishing_job;
 
 GetOptions ("checkpoint|c" => \$Checkpoint,
             "signal|s:s" => \$signal,
@@ -81,6 +85,7 @@ GetOptions ("checkpoint|c" => \$Checkpoint,
             "help|h" => \$sos,
             "array" => \$array,
             "sql=s"   => \$Sql_property,
+            "force-terminate-finishing-job" => \$Force_terminate_finishing_job,
             "version|V" => \$Version
            );
 
@@ -253,6 +258,38 @@ if (defined($Checkpoint)){
             }
         }
     }
+}elsif (defined($Force_terminate_finishing_job)){
+    my $lusr= $ENV{OARDO_USER};
+    if (($lusr ne "oar") and ($lusr ne "root")){
+        $exitValue = 8;
+        warn("You must be oar or root to use the --force-terminate-finishing-job option\n");
+    }else{
+        my $max_duration = 2 * OAR::Tools::get_taktuk_timeout() + OAR::Conf::get_conf_with_default_param("SERVER_PROLOGUE_EPILOGUE_TIMEOUT",0);
+        foreach my $j (@job_ids){
+            print("Force the termination of the job = $j ...");
+            if (OAR::IO::get_job_state($base, $j) eq "Finishing"){
+                my $duration = OAR::IO::get_job_duration_in_state($base, $j, "Finishing");
+                if ($duration > $max_duration){
+                    OAR::IO::add_new_event($base,"FORCE_TERMINATE_FINISHING_JOB",$j,"[oardel] Force to Terminate the job $j which is in Finishing state");
+                    print("REGISTERED.\n");
+                }else{
+                    $exitValue = 11;
+                    print("ERROR.\n");
+                    warn("The job $j is not in the Finishing state for more than ".$max_duration."s (".$duration."s).\n");
+                }
+            }else{
+                $exitValue = 10;
+                print("ERROR.\n");
+                warn("The job $j is not in the Finishing state.\n");
+            }
+        }
+        OAR::IO::disconnect($base);
+        my $strError = OAR::Tools::notify_tcp_socket($remote_host,$remote_port,"ChState");
+        if (defined($strError)){
+            warn("$strError\n");
+            $exitValue = 2;
+        }
+    }
 }elsif (defined($Besteffort)){
     my $lusr= $ENV{OARDO_USER};
     if (($lusr ne "oar") and ($lusr ne "root")){
-----------------------------------------------------------------------

Summary of changes:
 CHANGELOG                                 |    2 +
 docs/documentation/FAQ-ADMIN              |    9 ++++++-
 sources/core/common-libs/lib/OAR/IO.pm    |   34 +++++++++++++++++++++++-
 sources/core/man/man1/oardel.pod          |    9 ++++++-
 sources/core/modules/node_change_state.pl |   32 +++++++++++------------
 sources/core/qfunctions/oardel            |   39 ++++++++++++++++++++++++++++-
 6 files changed, 103 insertions(+), 22 deletions(-)


hooks/post-receive
-- 
OAR



More information about the Oar-commits mailing list