diff --git a/turbine/code/export/job.swift b/turbine/code/export/job.swift
index 0e97a94e3..5756d9a21 100644
--- a/turbine/code/export/job.swift
+++ b/turbine/code/export/job.swift
@@ -3,6 +3,6 @@
 // Various system-level configurations for application jobs.
 
 @par @dispatch=WORKER
-(int status) job_srun(int cores_per_job, int procs_per_job,
-                      string cmd_line[])
+(int status) job_srun(int cores_per_node, int cores_per_job, int procs_per_job,
+                      boolean bind, string cmd_line[])
 "turbine" "0.0" "job_srun_tcl";
diff --git a/turbine/code/lib/functions.tcl b/turbine/code/lib/functions.tcl
index d77f1e588..a9623768a 100644
--- a/turbine/code/lib/functions.tcl
+++ b/turbine/code/lib/functions.tcl
@@ -592,9 +592,38 @@ namespace eval turbine {
     }
     return $result
   }
+
+  # Return a list of count integers: start, start+step, start+2*step, ...
+  # An empty list is returned when count <= 0.
+  proc contig { start count { step 1 } } {
+    set result [ list ]
+    set value $start
+    for { set i 0 } { $i < $count } { incr i } {
+      lappend result $value
+      incr value $step
+    }
+    return $result
+  }
+
+  # Break list L into count equal-size chunks of size s = floor(n/count),
+  # where n is the length of L.  Elements left over after the last full
+  # chunk are dropped.  Raises an error if count is not positive.
+  proc fragment { L count } {
+    if { $count <= 0 } {
+      error "fragment: count must be positive, got: $count"
+    }
+    set result [ list ]
+    set s [ expr { [ llength $L ] / $count } ]
+    for { set c 0 } { $c < $count } { incr c } {
+      set index [ expr { $c * $s } ]
+      # When s == 0 the range is empty, so each chunk is an empty list
+      lappend result [ lrange $L $index [ expr { $index + $s - 1 } ] ]
+    }
+    return $result
+  }
 }
 
 # Local Variables:
 # mode: tcl
-# tcl-indent-level: 4
+# tcl-indent-level: 2
 # End:
diff --git a/turbine/code/lib/job.tcl b/turbine/code/lib/job.tcl
index 1a2b67dfc..dbbdd9588 100644
--- a/turbine/code/lib/job.tcl
+++ b/turbine/code/lib/job.tcl
@@ -6,17 +6,26 @@ namespace eval turbine {
 
   proc job_srun_tcl { outputs inputs } {
     set exit_code [ lindex $outputs 0 ]
-    set cpj [ lindex $inputs 0 ]
-    set ppj [ lindex $inputs 1 ]
-    set cmd_line [ lindex $inputs 2 ]
-    rule $inputs "turbine::job_srun_tcl_body $exit_code $cpj $ppj $cmd_line" \
+    # Inputs arrive in the order of the Swift job_srun() signature:
+    # cores_per_node, cores_per_job, procs_per_job, bind, cmd_line
+    set cpn [ lindex $inputs 0 ]
+    set cpj [ lindex $inputs 1 ]
+    set ppj [ lindex $inputs 2 ]
+    set bind [ lindex $inputs 3 ]
+    set cmd_line [ lindex $inputs 4 ]
+    rule $inputs \
+        "turbine::job_srun_tcl_body $exit_code $cpn $cpj $ppj $bind $cmd_line" \
         type $turbine::WORK
   }
 
-  proc job_srun_tcl_body { exit_code cpj ppj cmd_line } {
+  proc job_srun_tcl_body { exit_code cpn cpj ppj bind cmd_line } {
     # Retrieve data (decr?)
-    set cpj_value [ retrieve_integer $cpj ]
-    set ppj_value [ retrieve_integer $ppj ]
+    set cpn_value [ retrieve_integer $cpn ]
+    set cpj_value [ retrieve_integer $cpj ]
+    set ppj_value [ retrieve_integer $ppj ]
+    # NOTE(review): bind is declared boolean in Swift; confirm that
+    # retrieve_integer is the right accessor for it
+    set bind_value [ retrieve_integer $bind ]
     # Unpack command line
     set D [ adlb::enumerate $cmd_line dict all 0 ]
     set cmd_value [ list ]
@@ -25,27 +34,98 @@ namespace eval turbine {
       lappend cmd_value [ dict get $D $k ]
     }
     # Run the user code
-    set exit_code_value [ job_srun_impl $cpj_value $ppj_value $cmd_value ]
+    set exit_code_value \
+        [ job_srun_impl $cpn_value $cpj_value $ppj_value $bind_value $cmd_value ]
     # Store result
     store_integer $exit_code $exit_code_value
   }
 
-  proc job_srun_impl { cpj ppj cmd } {
+  # Set up and run the job under srun on a single node (-N 1).
+  # cpn: cores per node, cpj: cores per job, ppj: processes per job,
+  # bind: if true, add the --cpu-bind option built by bind_mask_cpu,
+  # cmd: the user command line as a list of words.
+  # Returns 0 on success, or 1 if any step of running srun raised an error.
+  proc job_srun_impl { cpn cpj ppj bind cmd } {
+    puts "turbine: srun: job_srun ..."
+
+    if { $bind } {
+      set cpu_bind [ bind_mask_cpu $cpn $cpj $ppj ]
+    } else {
+      set cpu_bind ""
+    }
+
+    puts "turbine: srun: job_srun -n $ppj -N 1 $cpu_bind $cmd"
+    # Use pwd rather than env(PWD), which raises an error if PWD is unset
+    puts "turbine: srun: in PWD: [ pwd ]"
     try {
-      puts "turbine: srun: exec: srun -n $ppj $cmd"
-      set fp [ open "|srun -n $ppj $cmd" "r" ]
-      show fp
+      # Run the user job!  (with pipe to capture output)
+      # 2>@1 merges the job's stderr into the captured stdout
+      set fp [ open "|srun -n $ppj -N 1 $cpu_bind $cmd 2>@1" "r" ]
       while { [ gets $fp line ] >= 0 } {
         puts "srun: $line"
       }
       close $fp
     } on error e {
-      puts "turbine: srun failed!"
-      puts "turbine: srun error message begin:"
-      puts $e
-      puts "turbine: srun error message end."
+      job_srun_error $e
       return 1
     }
     return 0
   }
+
+  # Build the SLURM --cpu-bind option for one job.
+  # cpn: cores per node, cpj: cores per job, ppj: processes per job.
+  # Reads env(ADLB_RANK_OFFSET) and env(PPN); both must be set.
+  # Returns "--cpu-bind=verbose,mask_cpu:" followed by one hexadecimal
+  # mask per process, separated by commas.
+  proc bind_mask_cpu { cpn cpj ppj } {
+    global env
+    set cpu_bind "--cpu-bind=verbose,mask_cpu:"
+    set offset $env(ADLB_RANK_OFFSET)
+    set ppn $env(PPN)
+    show offset ppn cpn cpj ppj
+
+    # cores per node divided by PPN (presumably the per-rank core budget)
+    set cpj_max [ expr { $cpn / $ppn } ]
+    show cpj_max
+    # NOTE(review): presumably offset is this rank's index on its node,
+    # so start is the first core of this rank's share - confirm
+    set start [ expr { $cpj_max * $offset } ]
+    # cpj core ids beginning at start, step apart
+    set step [ expr { $cpj_max / $cpj } ]
+    set S2 [ contig $start $cpj $step ]
+    show step S2
+    # One chunk of core ids per process
+    set K [ fragment $S2 $ppj ]
+    show K
+
+    set masks [ list ]
+    foreach chunk $K {
+      set mask [ list2mask $chunk ]
+      show mask
+      lappend masks $mask
+    }
+    show masks
+    append cpu_bind [ join $masks "," ]
+    return $cpu_bind
+  }
+
+  # Print the srun failure report for Tcl error message e.
+  proc job_srun_error { e } {
+    puts "turbine: srun failed!"
+    puts "turbine: srun error message begin:"
+    puts $e
+    puts "turbine: srun error message end."
+  }
+
+  # Convert a list of non-negative bit positions into a hexadecimal
+  # mask string, e.g. { 0 2 } -> 0x5
+  proc list2mask { L } {
+    set A 0
+    foreach i $L {
+      # OR the bit in, so a repeated position cannot carry into the next bit
+      set A [ expr { $A | (1 << $i) } ]
+    }
+    # ll: format without truncation, so positions >= 64 stay in the mask
+    return [ format "0x%llX" $A ]
+  }
 }