/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <public/sched.h>

/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
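
/*
 * Example command-line usage: booting with "sched=sedf" selects the SEDF
 * scheduler instead of the default credit scheduler, and adding
 * "dom0_vcpus_pin" pins each dom0 VCPU to the physical CPU it is first
 * placed on (see the affinity setup in sched_init_vcpu() below).
 */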

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};
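
/*
 * Note on registration: each scheduler compiles to a 'struct scheduler'
 * instance (e.g. sched_credit_def in sched_credit.c) and is listed in the
 * NULL-terminated schedulers[] array above. scheduler_init() copies the
 * entry whose opt_name matches the "sched=" boot option into the global
 * 'ops', and generic code then dispatches through the SCHED_OP() macro
 * below.
 */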

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
      : (typeof(ops.fn(__VA_ARGS__)))0 )
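
/*
 * For example, SCHED_OP(init_vcpu, v) evaluates to
 *     ops.init_vcpu ? ops.init_vcpu(v) : (typeof(ops.init_vcpu(v)))0
 * i.e. a scheduler may leave optional hooks NULL, and callers then get a
 * zero/NULL result of the hook's return type rather than a crash.
 */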

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /* Fast lock-free path. */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}
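
/*
 * vcpu_runstate_get() backs guest-visible runstate reporting (for example,
 * the VCPUOP_get_runstate_info hypercall handler uses it). The time charged
 * to the VCPU's current state is extended up to NOW() before returning, so
 * callers see up-to-date totals rather than the value recorded at the last
 * state change.
 */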

int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}
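
/*
 * Note the contract: vcpu_sleep_nosync() only asks the scheduler to stop
 * running the VCPU, whereas vcpu_sleep_sync() additionally spins until the
 * VCPU has actually been descheduled (is_running is cleared in
 * context_saved()) and its register state has been synchronised, so the
 * caller may safely inspect or modify that state afterwards.
 */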

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. The check of v->is_running happens /after/ setting the migration
     * flag because they both happen in (different) spinlock regions, and
     * those regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}
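
/*
 * The _VPF_migrating flag is set by vcpu_set_affinity() and
 * vcpu_force_reschedule() below. If the VCPU is still running when
 * vcpu_migrate() is first attempted, the check above bails out and the
 * migration is retried from context_saved() once the context switch off
 * the old CPU has completed.
 */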

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler,
 * which can thus be done without any need for synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
        return -EINVAL;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    v->cpu_affinity = *affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}
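
/*
 * Why that ordering avoids the wakeup-waiting race: an event that arrives
 * before the local_events_need_delivery() check is caught by the check and
 * the block is cancelled; an event that arrives after it finds _VPF_blocked
 * already set, so the corresponding vcpu_unblock()/vcpu_wake() clears the
 * flag and reschedules the VCPU. Either way a wakeup cannot be lost.
 */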

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    evtchn_port_t port;
    long rc = 0;
    unsigned int i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->is_polling = 1;
    d->is_polling = 1;

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_wmb();

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, shared_info_addr(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->is_polling = 0;
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}
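
/*
 * Illustrative guest-side invocation of SCHEDOP_poll (a sketch only: the
 * HYPERVISOR_sched_op() wrapper and the local 'port' variable belong to the
 * guest OS, not to this file):
 *
 *     evtchn_port_t port = ...;                 // event channel to wait on
 *     struct sched_poll poll = { .nr_ports = 1, .timeout = 0 };
 *     set_xen_guest_handle(poll.ports, &port);
 *     HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
 *
 * A zero timeout means "block until one of the ports becomes pending"; a
 * non-zero value arms v->poll_timer with that value as an absolute
 * system-time deadline (set_timer() takes an absolute expiry time).
 */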

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */

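/*
 * do_sched_op() below sits outside the #ifndef COMPAT region on purpose:
 * when CONFIG_COMPAT is enabled it is compiled a second time via the
 * #include of "compat/schedule.c" at the bottom of this file, with ret_t
 * and the guest-handle/copy macros redefined for the 32-bit guest ABI.
 */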
ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        /* domain_pause() prevents any further execution in guest context. */
        domain_pause(d);
        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        domain_unpause(d);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middle ground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO, "Warning: huge timeout set by domain %d "
                 "(vcpu %d): %"PRIx64"\n",
                 v->domain->domain_id, v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        if ( v->singleshot_timer.cpu != smp_processor_id() )
        {
            stop_timer(&v->singleshot_timer);
            v->singleshot_timer.cpu = smp_processor_id();
        }

        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}
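
/*
 * Sanity check on the "about 13 days" figure used above:
 * 2^50 ns = 1,125,899,906,842,624 ns ~= 1.126e6 s ~= 13.03 days, so any
 * positive offset with bits at or above bit 50 set is treated as bogus.
 */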

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}

static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    uint64_t periodic_next_event;

    ASSERT(!active_timer(&v->periodic_timer));

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;
    if ( now > periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    v->periodic_timer.cpu = smp_processor_id();
    set_timer(&v->periodic_timer, periodic_next_event);
}
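
/*
 * vcpu_periodic_timer_work() only runs on the CPU the VCPU is about to run
 * on: it is called from schedule() just before switching to the VCPU, and
 * from the periodic timer callback itself. Together with the forced
 * reschedule when the period is changed (see vcpu_force_reschedule()
 * above), this keeps the periodic-timer state free of cross-CPU
 * synchronisation.
 */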

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu *prev = current, *next = NULL;
    s_time_t now = NOW();
    struct schedule_data *sd;
    struct task_slice next_slice;
    s32 r_time;     /* time for new dom to run */

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;

    set_timer(&sd->s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}

void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( !v->is_polling )
        return;

    v->is_polling = 0;
    vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}
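
/*
 * Note that if the "sched=" option names an unknown scheduler, the loop
 * above terminates with 'ops' still holding the last entry copied from
 * schedulers[] (currently sched_credit_def), so after printing the warning
 * Xen falls back to that scheduler rather than refusing to boot.
 */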

void dump_runq(unsigned char key)
{
    s_time_t now = NOW();
    int i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */