Diffstat (limited to 'sys/kern/kern_sched.c')
-rw-r--r--  sys/kern/kern_sched.c  285
1 file changed, 213 insertions(+), 72 deletions(-)
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 4bbe5a0..9c5e215 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -34,6 +34,7 @@
#include <sys/param.h>
#include <sys/syslog.h>
#include <sys/atomic.h>
+#include <dev/cons/cons.h>
#include <machine/frame.h>
#include <machine/cpu.h>
#include <machine/cdefs.h>
@@ -44,7 +45,8 @@
#define pr_trace(fmt, ...) kprintf("ksched: " fmt, ##__VA_ARGS__)
-void sched_switch(struct trapframe *tf);
+void md_sched_switch(struct trapframe *tf);
+void sched_accnt_init(void);
static sched_policy_t policy = SCHED_POLICY_MLFQ;
@@ -63,7 +65,7 @@ __cacheline_aligned static struct spinlock tdq_lock = {0};
/*
* Perform timer oneshot
*/
-static inline void
+void
sched_oneshot(bool now)
{
struct timer timer;
@@ -77,39 +79,75 @@ sched_oneshot(bool now)
}
/*
- * Save thread state and enqueue it back into one
- * of the ready queues.
+ * Returns true if a processor is associated
+ * with a specific thread
+ *
+ * @ci: CPU that wants to take 'td'
+ * @td: Thread to check against
*/
-static void
-sched_save_td(struct proc *td, struct trapframe *tf)
+static bool
+cpu_is_assoc(struct cpu_info *ci, struct proc *td)
{
/*
- * Save trapframe to process structure only
- * if PROC_EXEC is not set.
+ * If we are not pinned, any processor is
+ * associated.
*/
- if (!ISSET(td->flags, PROC_EXEC)) {
- memcpy(&td->tf, tf, sizeof(td->tf));
+ if (!ISSET(td->flags, PROC_PINNED)) {
+ return true;
}
- sched_enqueue_td(td);
+ return ci->id == td->affinity;
}
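
The association rule reduces to: an unpinned thread may run anywhere, while a
pinned thread may run only on the CPU whose id matches its affinity. A
standalone sketch of the same predicate (the toy_* names and F_PINNED flag are
illustrative stand-ins, not the kernel's types):

	#include <stdbool.h>
	#include <stdint.h>

	#define F_PINNED 0x01		/* stand-in for PROC_PINNED */

	struct toy_cpu { uint32_t id; };
	struct toy_td  { uint32_t flags; uint32_t affinity; };

	/* Mirrors cpu_is_assoc(): unpinned threads match any CPU */
	static bool
	toy_is_assoc(const struct toy_cpu *ci, const struct toy_td *td)
	{
		if ((td->flags & F_PINNED) == 0)
			return true;
		return ci->id == td->affinity;
	}
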
-static struct proc *
+struct proc *
sched_dequeue_td(void)
{
struct sched_queue *queue;
struct proc *td = NULL;
+ struct cpu_info *ci;
+ uint32_t ncpu = 0;
spinlock_acquire(&tdq_lock);
+ ci = this_cpu();
for (size_t i = 0; i < SCHED_NQUEUE; ++i) {
queue = &qlist[i];
- if (!TAILQ_EMPTY(&queue->q)) {
- td = TAILQ_FIRST(&queue->q);
- TAILQ_REMOVE(&queue->q, td, link);
- spinlock_release(&tdq_lock);
- return td;
+ if (TAILQ_EMPTY(&queue->q)) {
+ continue;
}
+
+ td = TAILQ_FIRST(&queue->q);
+ if (td == NULL) {
+ continue;
+ }
+
+ while (ISSET(td->flags, PROC_SLEEP)) {
+ td = TAILQ_NEXT(td, link);
+ if (td == NULL) {
+ break;
+ }
+		}
+
+		/* The sleep scan may have run off the end of the queue */
+		if (td == NULL) {
+			continue;
+		}
+
+ /*
+ * If we are on a multicore system and this isn't
+ * our process, don't take it. Some threads might
+ * be pinned to a specific processor.
+ */
+ ncpu = cpu_count();
+ while (!cpu_is_assoc(ci, td) && ncpu > 1) {
+ td = TAILQ_NEXT(td, link);
+ if (td == NULL) {
+ break;
+ }
+ }
+
+ if (td == NULL) {
+ continue;
+ }
+
+ TAILQ_REMOVE(&queue->q, td, link);
+ spinlock_release(&tdq_lock);
+ return td;
}
/* We got nothing */
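
Taken together, the dequeue path is an eligibility scan: walk each queue from
highest priority down, skip anything sleeping, skip anything pinned to another
CPU, and take the first survivor. A minimal user-space model of one queue's
scan using <sys/queue.h>, folding both checks into a single pass (the toy_*
names and flag values are illustrative):

	#include <stddef.h>
	#include <sys/queue.h>

	#define F_SLEEP  0x01	/* stand-in for PROC_SLEEP  */
	#define F_PINNED 0x02	/* stand-in for PROC_PINNED */

	struct toy_td {
		unsigned flags;
		unsigned affinity;
		TAILQ_ENTRY(toy_td) link;
	};
	TAILQ_HEAD(toy_q, toy_td);

	/* First entry this CPU may take, or NULL if none qualify */
	static struct toy_td *
	pick_eligible(struct toy_q *q, unsigned cpu_id, unsigned ncpu)
	{
		struct toy_td *td;

		TAILQ_FOREACH(td, q, link) {
			if (td->flags & F_SLEEP)
				continue;
			if (ncpu > 1 && (td->flags & F_PINNED) &&
			    td->affinity != cpu_id)
				continue;
			return td;
		}
		return NULL;
	}

Folding the checks into one loop would also close the gap left by the two
separate scans above, where the affinity walk can stop on a thread that is
still marked PROC_SLEEP.
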
@@ -141,6 +179,9 @@ this_td(void)
struct cpu_info *ci;
ci = this_cpu();
+ if (ci == NULL) {
+ return NULL;
+ }
return ci->curtd;
}
@@ -177,62 +218,21 @@ td_pri_update(struct proc *td)
}
/*
- * Perform a context switch.
+ * MI work to be done during a context
+ * switch. Called by md_sched_switch()
*/
void
-sched_switch(struct trapframe *tf)
+mi_sched_switch(struct proc *from)
{
- struct cpu_info *ci;
- struct pcb *pcbp;
- struct proc *next_td, *td;
- bool use_current = true;
-
- ci = this_cpu();
- td = ci->curtd;
-
- if (td != NULL) {
- dispatch_signals(td);
- td_pri_update(td);
- }
-
- /*
- * Get the next thread and use it only if it isn't
- * in the middle of an exit, exec, or whatever.
- */
- do {
- if ((next_td = sched_dequeue_td()) == NULL) {
- sched_oneshot(false);
+ if (from != NULL) {
+ if (from->pid == 0)
return;
- }
- /*
- * If we are in the middle of an exec, don't use this
- * thread.
- */
- if (ISSET(next_td->flags, PROC_EXEC)) {
- use_current = false;
- }
-
- /*
- * Don't use this thread if we are currently
- * exiting.
- */
- if (ISSET(next_td->flags, PROC_EXITING)) {
- use_current = false;
- }
- } while (!use_current);
-
- /* Save the previous thread */
- if (td != NULL) {
- sched_save_td(td, tf);
+ dispatch_signals(from);
+ td_pri_update(from);
}
- memcpy(tf, &next_td->tf, sizeof(*tf));
- ci->curtd = next_td;
- pcbp = &next_td->pcb;
-
- pmap_switch_vas(pcbp->addrsp);
- sched_oneshot(false);
+ cons_detach();
}
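
The switch logic removed here did not disappear; it moved to the
machine-dependent md_sched_switch(), which is expected to call
mi_sched_switch() before swapping register state. A hedged reconstruction of
that shape, pieced together from the removed sched_switch() body (the actual
MD implementation may differ):

	void
	md_sched_switch(struct trapframe *tf)
	{
		struct cpu_info *ci = this_cpu();
		struct proc *from = ci->curtd, *next;

		/* MI bookkeeping: signals, priority update, console detach */
		mi_sched_switch(from);

		/* Nothing runnable: re-arm the timer and return to 'from' */
		if ((next = sched_dequeue_td()) == NULL) {
			sched_oneshot(false);
			return;
		}

		/*
		 * Save the outgoing register state and re-enqueue the old
		 * thread (the removed code skipped the save when PROC_EXEC
		 * was set), then load the incoming state.
		 */
		if (from != NULL) {
			memcpy(&from->tf, tf, sizeof(from->tf));
			sched_enqueue_td(from);
		}
		memcpy(tf, &next->tf, sizeof(*tf));

		ci->curtd = next;
		pmap_switch_vas(next->pcb.addrsp);
		sched_oneshot(false);
	}
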
/*
@@ -242,9 +242,8 @@ void
sched_enter(void)
{
md_inton();
- md_sync_all();
+ sched_oneshot(false);
for (;;) {
- sched_oneshot(false);
md_pause();
}
}
@@ -252,14 +251,154 @@ sched_enter(void)
void
sched_yield(void)
{
- struct proc *td = this_td();
+ struct proc *td;
+ struct cpu_info *ci = this_cpu();
- if (td != NULL) {
- td->rested = true;
+ if ((td = ci->curtd) == NULL) {
+ return;
}
+ td->rested = true;
+
+	/* FIXME: Yielding hangs while we are being waited on */
+ if (ISSET(td->flags, PROC_WAITED)) {
+ return;
+ }
+
+ ci->curtd = NULL;
+ md_inton();
sched_oneshot(false);
- while (td->rested);
+
+ md_hlt();
+ md_intoff();
+ ci->curtd = td;
+}
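
sched_yield() now genuinely parks the CPU (md_hlt() until the next timer tick)
instead of spinning on td->rested. A hypothetical caller polls a condition and
yields between checks, much as sched_suspend() below does:

	/* Hypothetical polling helper: sleep-poll a completion flag */
	static void
	wait_for_done(volatile int *done)
	{
		while (*done == 0)
			sched_yield();
	}
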
+
+void
+sched_detach(struct proc *td)
+{
+ struct sched_queue *queue;
+
+ spinlock_acquire(&tdq_lock);
+ queue = &qlist[td->priority];
+
+ TAILQ_REMOVE(&queue->q, td, link);
+ spinlock_release(&tdq_lock);
+}
+
+/*
+ * Pin a process to a specific processor
+ *
+ * @td: Process to pin
+ * @cpu: Logical processor ID to pin `td' to.
+ *
+ * XXX: 'cpu' is a machine-independent value representing
+ * CPU<n>
+ */
+void
+proc_pin(struct proc *td, affinity_t cpu)
+{
+ td->affinity = cpu;
+ td->flags |= PROC_PINNED;
+}
+
+/*
+ * Unpin a pinned process, allowing it to be
+ * picked up by any processor
+ *
+ * @td: Process to unpin
+ */
+void
+proc_unpin(struct proc *td)
+{
+ td->affinity = 0;
+ td->flags &= ~PROC_PINNED;
+}
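
Intended usage of the pin interface is bracket-style around a CPU-sensitive
section (hypothetical example; the choice of CPU 1 and the workload are
arbitrary):

	struct proc *td = this_td();

	proc_pin(td, 1);	/* only CPU 1 may dequeue us now */
	do_cache_hot_work();	/* hypothetical workload */
	proc_unpin(td);		/* any CPU may pick us up again */

Note that proc_unpin() resets the affinity to 0 unconditionally, so pins do
not nest; clearing PROC_PINNED is what makes the thread schedulable anywhere,
since an affinity of 0 would otherwise name CPU 0.
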
+
+/*
+ * Suspend a process for a specified amount
+ * of time. The calling process will yield for
+ * the amount of time specified in 'tv'.
+ *
+ * @td: Process to suspend (NULL for current)
+ * @tv: Time value to use
+ *
+ * XXX: 'tv' being NULL is equivalent to calling
+ * sched_detach()
+ */
+void
+sched_suspend(struct proc *td, const struct timeval *tv)
+{
+ struct timer tmr;
+ const time_t USEC_PER_SEC = 1000000;
+ ssize_t usec;
+ time_t usec_cur, usec_tmp;
+ bool have_timer = true;
+ tmrr_status_t tmr_status;
+
+ if (td == NULL)
+ td = this_td();
+ if (__unlikely(td == NULL))
+ return;
+
+ if (tv == NULL) {
+ sched_detach(td);
+ return;
+ }
+
+	/*
+	 * We want a generic timer so that we can compute how
+	 * much time has elapsed since this process requested
+	 * to be suspended. However, we cannot assume that one
+	 * is present. If the lookup fails, all we can do is
+	 * estimate how much time went by, which works too,
+	 * just less accurately.
+	 */
+	tmr_status = req_timer(TIMER_GP, &tmr);
+	if (tmr_status != TMRR_SUCCESS) {
+		have_timer = false;
+	}
+
+	/* We need microsecond precision */
+	if (have_timer && tmr.get_time_usec == NULL) {
+		have_timer = false;
+	}
+
+ /*
+ * Compute the max time in microseconds that
+ * we will wait. We are using both tv->tv_sec
+ * and tv->tv_usec
+ */
+ usec = tv->tv_usec;
+ usec += tv->tv_sec * USEC_PER_SEC;
+ usec_cur = (have_timer) ? tmr.get_time_usec() : 0;
+
+ for (;;) {
+ sched_yield();
+
+		/*
+		 * If we have a timer in our paws, compute how much
+		 * time went by since the last check. Otherwise we
+		 * estimate by assuming one scheduler quantum has
+		 * elapsed.
+		 *
+		 * XXX: The timing here works decently as intended.
+		 * However, it would be nice to smooth out any jitter.
+		 * That could probably be done by subtracting the
+		 * exponential moving average of 'usec_tmp' from
+		 * 'usec' rather than the raw value.
+		 */
+		if (have_timer) {
+			usec_tmp = (tmr.get_time_usec() - usec_cur);
+			usec_cur += usec_tmp;
+		} else {
+			usec_tmp = DEFAULT_TIMESLICE_USEC;
+		}
+
+		/* Stop once the requested time has elapsed */
+ usec -= usec_tmp;
+ if (usec <= 0) {
+ break;
+ }
+ }
}
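
The XXX above suggests smoothing jitter with an exponential moving average of
the per-iteration delta. A sketch of what the end of sched_suspend() could
look like in that variant (the 1/8 smoothing factor and the 'ema' variable are
assumptions, not part of this change):

	time_t ema = 0;

	for (;;) {
		sched_yield();

		if (have_timer) {
			usec_tmp = tmr.get_time_usec() - usec_cur;
			usec_cur += usec_tmp;
		} else {
			usec_tmp = DEFAULT_TIMESLICE_USEC;
		}

		/* ema += alpha * (sample - ema), with alpha = 1/8 */
		if (ema == 0)
			ema = usec_tmp;		/* seed on first sample */
		else
			ema += (usec_tmp - ema) / 8;

		usec -= ema;
		if (usec <= 0)
			break;
	}
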
void
@@ -272,4 +411,6 @@ sched_init(void)
pr_trace("prepared %d queues (policy=0x%x)\n",
SCHED_NQUEUE, policy);
+
+ sched_accnt_init();
}