next up previous contents
Next: Bibliography Up: The Effect of Deceptive Previous: 9 Conclusion   Contents

Subsections

Appendix: Detailed NWCS pseudocode

This appendix summarizes the implementation of the NWCS waiting mechanism, and separate heuristics for the SPTF (and C-LOOK) and Stride schedulers. This should facilitate easy implementation and deployment of NWCS in current operating systems. This does not include pseudocode for measuring disk characteristics and building an approximate positioning time calculator. For even greater detail, please refer to our prototype implementation. The source code is made publicly available on the project webpage, hosted at [ID01a].




I.  Generic NWCS scheduling framework

1.0

    
    // sched_{enqueue,dequeue,finish,collect_stats,evaluate}
    //        are the scheduler-specific functions described later

    enum STATE { WAIT = -1, IDLE = 0, BUSY = 1,2,3,... } state = IDLE;
    integer pending = 0;
    boolean expired = false;

    // these four functions should set the higher interrupt priority level
    // of the timer (splclock or splhigh), rather than of disk I/O (splbio)
    
    issue(new):
        // gets invoked by upper layers of the kernel
        // state == IDLE or BUSY or WAIT

        if (new.is_read_request) sched_collect_stats(new);
        sched_enqueue(new);
        pending++;

        if (new.is_read_request && (new.is_sync_request || state == WAIT))
            new.proc.blocked = true; // little hope in waiting for this process

        if (state == IDLE || state == WAIT) schedule();


    schedule():
        // internal function. sometimes takes a request from the pool and
        // services it; otherwise just returns with the timeout activated.
        // called with pending > 0

        static request last, next;      // retain values across calls

        if (expired)
        |   // state == WAIT
        |   expired = false;
        |   next = sched_choose(); // need this for SCSI only
        else if (last.proc.blocked)
        |   // state == WAIT
        |   next = sched_choose();
        else
        |   // state == IDLE or WAIT, or BUSY if SCSI
        |   next = sched_choose()
        |   duration = next.is_read_request ?
        |         sched_evaluate(last, next) : 0;
        |   if (duration > 0)
        |   |   if (state == IDLE)
        |   |       state = WAIT;
        |   |       set_timeout(duration);
        |   |   else
        |   |       // do nothing; in particular, don't retrigger timeout
        |_  |_  return; // "next" contains the chosen candidate

        next = sched_choose();          // probably need this for SCSI
        sched_dequeue(next);
        pending--;
        if (state == WAIT) { cancel_timeout(); state = IDLE; }
        state++;                        // idle->busy, busy->busier
        last = next; next = NULL;
        PERFORM_DISK_IO(last);          // run request on disk


    timeout_expire():
        // gets called by timeout code, in interrupt context
        // state == WAIT, pending > 0, next != NULL
        expired = true;
        schedule();


    finish(req):
        // gets called by disk driver when current finishes execution
        // state == BUSY
        state--;                        // busy:1->idle, busier->busy
        sched_finish(req);
        req.proc.prev_finish_time = NOW;
        if (pending) schedule();
1.5

II.  NWCS heuristic for SPTF

1.0
    sched_collect_stats(new):

        // every process has:
        //    buckets:[0..N] with N = 30 (i.e. least count = 500us)
        //    expected_thinktime, expected_most_thinktime
        //    expected_seek_distance, expected_seek_direction
        //    expected_positioning_time

        if (req \in scheduler_queue
            && req.is_read_request && req.proc == new.proc)
              thinktime = 0
        else
              thinktime = min(15ms, NOW - new.proc.prev_finish_time);
                  // the process issuing "new" computes for some time
                  // between its finish time for previous request and NOW

        loop:i:(0..N) new.proc.bucket[i] *= 0.9         // decay all buckets
        new.proc.bucket[(thinktime/15ms) * N] += 1;     // increment this bucket
        new.proc.expected_thinktime      = median(new.proc.buckets)
        new.proc.expected_most_thinktime = 95percentile(new.proc.buckets)


        seekdist      = ABS(new.location - new.proc.prev_location)
        seekdirection = (new.location > new.proc.prev_location) ? 1 : 0;
        new.proc.prev_location = new.location; // update prev_location

        new.proc.expected_seek_distance *= 0.74
        new.proc.expected_seek_distance += seekdist

        new.proc.expected_seek_direction *= 0.74   // used by C-LOOK heuristic
        new.proc.expected_seek_direction += seekdirection

        new.proc.expected_positioning_time =
              calculate_postime(new.proc.expected_seek_distance)
                // we use an approximation of positioning time based on seek
                // distance; an exact version would need some more arguments


    sched_evaluate(last, next):

        next_seek_distance = ABS(next.location - last.location)
        next_seek_direction = next.location > last.location;
        next_positioning_time = calculate_postime(next_seek_distance)

||<-- ADDITIONAL CHECK FOR THE C-LOOK HEURISTIC ONLY
||
||      if (next_seek_direction == 1 /* ahead */
||      |   && last.proc.expected_seek_direction < 0.3 /* behind */)
||      |____   return 0                        // proceed immediately
||
||      if (next_seek_direction == 0 /* behind */
||      |   && last.proc.expected_seek_direction > 0.7 /* ahead */)
||      |____   return max(last.proc.expected_most_thinktime - elapsed, 0)

        // time elapsed since we finished servicing previous request.
        // this time needs to be deducted from both expected_thinktimes.
        elapsed = NOW - last.proc.prev_finish_time;
        last_expected_time = last.proc.expected_positioning_time +
                             max(last.proc.expected_thinktime - elapsed, 0)

        if (next_positioning_time < last_expected_time)
        |   return 0; // no point waiting; proceed immediately with "next"
        else
        |   return max(last.proc.expected_most_thinktime - elapsed, 0)
        |     // If we wait for 95 percentile of thinktime, then with 95%
        |     // probability, we would expect to receive the next request
        |____ // before we timeout. This time is bounded by 15ms.
1.5

III.  NWCS heuristic for Stride

1.0
    sched_collect_stats(new): reduced version of (or same as) that for SPTF

    sched_evaluate(last, next): // PROCESS VERSION

        minclock = min{ proc[i].clock, where (proc[i].pending > 0) }
        elapsed = NOW - last.proc.prev_finish_time;

        if (last.proc.pending == 0
        |   && last.proc.clock < minclock
        |   && max(last.proc.expected_thinktime - elapsed, 0) < 3ms)
        |      return max(last.proc.expected_thinktime - elapsed, 0)
        else
        |____  return 0;

    sched_evaluate(last, next): // RESOURCE CONTAINER VERSION
        
        minclock = min{ rc[i].clock, where (rc[i].pending > 0) }
        elapsed = NOW - last.proc.prev_finish_time;

        if (last.proc.rc.pending == 0
        |   && last.proc.rc.clock < minclock
        |   && max(last.proc.expected_thinktime - elapsed, 0) < 3ms)
        |      return max(last.proc.expected_thinktime - elapsed, 0)
        else
        |____  return 0;
1.5



Sitaram Iyer ssiyer@cs.rice.edu