/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <public/sched.h>

/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);

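/*
 * Usage note (illustrative): both knobs above are hypervisor boot
 * parameters, e.g. "sched=credit" or "sched=sedf" on the Xen command
 * line selects the scheduler, and adding "dom0_vcpus_pin" pins each
 * dom0 VCPU to the physical CPU on which it was created.
 */
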
#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
      : (typeof(ops.fn(__VA_ARGS__)))0 )

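/*
 * For illustration, SCHED_OP(init_vcpu, v) expands (roughly) to:
 *
 *   ( ops.init_vcpu != NULL ) ? ops.init_vcpu(v)
 *                             : (typeof(ops.init_vcpu(v)))0
 *
 * so a hook the active scheduler does not implement degrades to a no-op
 * that yields a zero of the hook's return type.
 */
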
/*
 * Update a VCPU's runstate accounting. The caller must hold the schedule
 * lock of the VCPU's current processor.
 */
static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /*
         * Fast lock-free path: only the local CPU can change the runstate
         * of the currently-running VCPU, so no lock is needed here.
         */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}

int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

/* Put a VCPU to sleep and wait until it is no longer running anywhere. */
void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->is_running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler,
 * and hence requires no explicit synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}

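/*
 * Illustrative caller (an assumption about this tree, not shown here):
 * the VCPUOP_set_periodic_timer handler updates v->periodic_period and
 * then calls vcpu_force_reschedule(v), so the new period takes effect
 * via the scheduler's re-arming of the timer on the next pass through
 * schedule().
 */
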
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
        return -EINVAL;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    v->cpu_affinity = *affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

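/*
 * Example caller (a sketch; assumes the cpumask helpers from
 * xen/cpumask.h): restrict a VCPU to CPUs 0 and 1.
 *
 *   cpumask_t mask = CPU_MASK_NONE;
 *   cpu_set(0, mask);
 *   cpu_set(1, mask);
 *   if ( vcpu_set_affinity(v, &mask) != 0 )
 *       handle the error: -EINVAL if dom0 is pinned, or if no online
 *       CPU is present in the mask.
 */
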
/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}

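/*
 * The wakeup-waiting race avoided above, spelled out: if an event fires
 * between local_event_delivery_enable() and the block, the waker may run
 * before this VCPU actually blocks. Re-testing local_events_need_delivery()
 * after setting _VPF_blocked guarantees that either we observe the event
 * here (and cancel the block), or vcpu_wake() observes the flag.
 */
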
static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc = 0;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->is_polling = 1;
    d->is_polling = 1;

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_wmb();

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, shared_info_addr(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->is_polling = 0;
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}

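/*
 * Guest-side sketch (illustrative; assumes a Linux-style hypercall
 * wrapper such as HYPERVISOR_sched_op): poll one event channel, with
 * an absolute timeout expressed in system time (nanoseconds).
 *
 *   struct sched_poll poll = { .nr_ports = 1, .timeout = deadline_ns };
 *   set_xen_guest_handle(poll.ports, &port);
 *   (void)HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
 */
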
/*
 * Voluntarily yield the processor: give up the remainder of this
 * scheduling allocation.
 */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */

ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        /* domain_pause() prevents any further execution in guest context. */
        domain_pause(d);
        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        domain_unpause(d);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

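/*
 * Guest-side sketch (illustrative; assumes Linux-style hypercall wrappers
 * and the SHUTDOWN_* reason codes from public/sched.h):
 *
 *   (void)HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 *
 *   struct sched_shutdown shutdown = { .reason = SHUTDOWN_reboot };
 *   (void)HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
 */
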
#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middle ground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO, "Warning: huge timeout set by domain %d "
                 "(vcpu %d): %"PRIx64"\n",
                 v->domain->domain_id, v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        if ( v->singleshot_timer.cpu != smp_processor_id() )
        {
            stop_timer(&v->singleshot_timer);
            v->singleshot_timer.cpu = smp_processor_id();
        }

        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}

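/*
 * Worked check of the cutoff above: 2^50 ns = 1,125,899,906,842,624 ns
 * ~= 1.126e6 seconds ~= 13.03 days, so (offset >> 50) != 0 catches any
 * timeout more than about 13 days in the future.
 */
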
/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts, as pausing a VCPU can
     *    trigger a TLB shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}

static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    uint64_t periodic_next_event;

    ASSERT(!active_timer(&v->periodic_timer));

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;
    if ( now > periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    v->periodic_timer.cpu = smp_processor_id();
    set_timer(&v->periodic_timer, periodic_next_event);
}

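/*
 * Note on the periodic timer life cycle: schedule() stops the previous
 * VCPU's periodic timer on a context switch and calls the function above
 * for the incoming VCPU, so the timer only runs while its VCPU does.
 */
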
/*
 * The main scheduling function:
 *  - deschedule the current VCPU (scheduler-independent).
 *  - pick a new VCPU to run (scheduler-dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;
    s32                   r_time;     /* time for new dom to run */

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* Get the policy-specific decision on which VCPU to run next. */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;

    set_timer(&sd->s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        /* Same VCPU chosen again: no context switch needed. */
        spin_unlock_irq(&sd->schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}

void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

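/*
 * Note (an inference from the comments above and in vcpu_migrate()): the
 * barriers ensure a migrator either observes is_running clear here, or
 * we observe _VPF_migrating set and perform the migration ourselves.
 */
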
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( !v->is_polling )
        return;

    v->is_polling = 0;
    vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        /* No match: fall back to the last scheduler left in 'ops'. */
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */