/*
 * linux/arch/i386/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/random.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
#include <asm/vm86.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/cpu_hotplug.h>

#include <linux/err.h>

#include <asm/tlbflush.h>
#include <asm/cpu.h>

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

static int hlt_counter;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Return saved PC of a blocked thread.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	return ((unsigned long *)tsk->thread.esp)[3];
}

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

void disable_hlt(void)
{
	hlt_counter++;
}

EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	hlt_counter--;
}

EXPORT_SYMBOL(enable_hlt);
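
/*
 * Illustrative sketch (not part of this file's logic): a driver that
 * must keep CPUs out of HLT across a timing-critical window would
 * bracket the region with the pair above.  The function below is
 * hypothetical.
 */
#if 0
static void example_no_hlt_window(void)
{
	disable_hlt();		/* idle loop will spin instead of halting */
	/* ... timing-critical device access ... */
	enable_hlt();		/* allow the idle loop to halt again */
}
#endif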

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	asm volatile(
		"2:"
		"testl %0, %1;"
		"rep; nop;"
		"je 2b;"
		: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
}
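
/*
 * For reference, a C sketch of the loop above (the asm form pins down
 * the exact flag-test/pause instruction sequence):
 */
#if 0
static void poll_idle_sketch(void)
{
	local_irq_enable();
	while (!(current_thread_info()->flags & _TIF_NEED_RESCHED))
		cpu_relax();	/* "rep; nop": pause the pipeline */
}
#endif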
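/*
 * Under Xen we cannot execute a native HLT; instead we clear
 * TS_POLLING (so other CPUs know a real reschedule interrupt is
 * required to wake us), re-check need_resched(), and block the VCPU
 * via safe_halt().
 */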
static void xen_idle(void)
{
	local_irq_disable();

	if (need_resched())
		local_irq_enable();
	else {
		current_thread_info()->status &= ~TS_POLLING;
		smp_mb__after_clear_bit();
		safe_halt();
		current_thread_info()->status |= TS_POLLING;
	}
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_HOTPLUG_CPU
extern cpumask_t cpu_initialized;
static inline void play_dead(void)
{
	idle_task_exit();
	local_irq_disable();
	cpu_clear(smp_processor_id(), cpu_initialized);
	preempt_enable_no_resched();
	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
	cpu_bringup();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	int cpu = smp_processor_id();

	current_thread_info()->status |= TS_POLLING;

	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = xen_idle; /* no alternatives */

			if (cpu_is_offline(cpu))
				play_dead();

			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
			idle();
		}
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

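/*
 * cpu_idle_wait - wait until the idle loop on every online CPU has run
 * at least once.  Each CPU's cpu_idle_state flag is set here and
 * cleared from cpu_idle(); we poll once a second until every flag is
 * clear (or the CPU in question has gone offline).
 */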
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
{
}

static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);

void show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

	printk("\n");
	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
	printk("EIP: %04x:[<%08lx>] CPU: %d\n", 0xffff & regs->xcs, regs->eip, smp_processor_id());
	print_symbol("EIP is at %s\n", regs->eip);

	if (user_mode_vm(regs))
		printk(" ESP: %04x:%08lx", 0xffff & regs->xss, regs->esp);
	printk(" EFLAGS: %08lx %s (%s %.*s)\n",
	       regs->eflags, print_tainted(), system_utsname.release,
	       (int)strcspn(system_utsname.version, " "),
	       system_utsname.version);
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax, regs->ebx, regs->ecx, regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds, 0xffff & regs->xes);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4_safe();
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
	show_trace(NULL, regs, &regs->esp);
}

/*
 * This gets run with %ebx containing the
 * function to call, and %edx containing
 * the "args".
 */
extern void kernel_thread_helper(void);
__asm__(".section .text\n"
	".align 4\n"
	"kernel_thread_helper:\n\t"
	"movl %edx,%eax\n\t"
	"pushl %edx\n\t"
	"call *%ebx\n\t"
	"pushl %eax\n\t"
	"call do_exit\n"
	".previous");

/*
 * Create a kernel thread
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	regs.ebx = (unsigned long) fn;
	regs.edx = (unsigned long) arg;

	regs.xds = __USER_DS;
	regs.xes = __USER_DS;
	regs.orig_eax = -1;
	regs.eip = (unsigned long) kernel_thread_helper;
	regs.xcs = GET_KERNEL_CS();
	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

	/* OK, create the new process. */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
}
EXPORT_SYMBOL(kernel_thread);
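
/*
 * Illustrative sketch (not part of this file): how a caller might use
 * kernel_thread().  The worker function and flag choice are an example
 * only.
 */
#if 0
static int example_worker(void *data)
{
	/* runs in kernel mode; do_exit() is called when it returns */
	return 0;
}

static void example_spawn(void)
{
	int pid = kernel_thread(example_worker, NULL,
				CLONE_FS | CLONE_FILES | SIGCHLD);
	if (pid < 0)
		printk("example: kernel_thread failed: %d\n", pid);
}
#endif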

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	/* The process may have allocated an I/O port bitmap... nuke it. */
	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
		struct task_struct *tsk = current;
		struct thread_struct *t = &tsk->thread;
		struct physdev_set_iobitmap set_iobitmap;
		memset(&set_iobitmap, 0, sizeof(set_iobitmap));
		HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	clear_tsk_thread_flag(tsk, TIF_DEBUG);
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	BUG_ON(dead_task->mm);
	release_vm86_irqs(dead_task);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *tsk;
	int err;

	childregs = task_pt_regs(p);
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);

	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	tsk = current;
	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct desc_struct *desc;
		struct user_desc info;
		int idx;

		err = -EFAULT;
		if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
			goto out;
		err = -EINVAL;
		if (LDT_empty(&info))
			goto out;

		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			goto out;

		desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}

	p->thread.iopl = current->thread.iopl;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * Fill in the user structure for a core dump.
 */
void dump_thread(struct pt_regs * regs, struct user * dump)
{
	int i;

	/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;
	dump->u_ssize = 0;
	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->thread.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs,dump->regs.fs);
	savesegment(gs,dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
}
EXPORT_SYMBOL(dump_thread);

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs ptregs = *task_pt_regs(tsk);
	ptregs.xcs &= 0xffff;
	ptregs.xds &= 0xffff;
	ptregs.xes &= 0xffff;
	ptregs.xss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

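/*
 * Restore per-task state that only some tasks carry: currently just
 * the hardware debug registers (DR0-DR3, DR6, DR7) for tasks that
 * have TIF_DEBUG set.
 */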
static noinline void __switch_to_xtra(struct task_struct *next_p)
{
	struct thread_struct *next;

	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		set_debugreg(next->debugreg[0], 0);
		set_debugreg(next->debugreg[1], 1);
		set_debugreg(next->debugreg[2], 2);
		set_debugreg(next->debugreg[3], 3);
		/* no 4 and 5 */
		set_debugreg(next->debugreg[6], 6);
		set_debugreg(next->debugreg[7], 7);
	}
}

/*
 * This function selects if the context switch from prev to next
 * has to tweak the TSC disable bit in the cr4.
 */
static inline void disable_tsc(struct task_struct *prev_p,
			       struct task_struct *next_p)
{
	struct thread_info *prev, *next;

	/*
	 * gcc should eliminate the ->thread_info dereference if
	 * has_secure_computing returns 0 at compile time (SECCOMP=n).
	 */
	prev = task_thread_info(prev_p);
	next = task_thread_info(next_p);

	if (has_secure_computing(prev) || has_secure_computing(next)) {
		/* slow path here */
		if (has_secure_computing(prev) &&
		    !has_secure_computing(next)) {
			write_cr4(read_cr4() & ~X86_CR4_TSD);
		} else if (!has_secure_computing(prev) &&
			   has_secure_computing(next))
			write_cr4(read_cr4() | X86_CR4_TSD);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPUs, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %eax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
	struct physdev_set_iopl iopl_op;
	struct physdev_set_iobitmap iobmp_op;
	multicall_entry_t _mcl[8], *mcl = _mcl;

	/* XEN NOTE: FS/GS saved in switch_mm(), not here. */

	/*
	 * This is basically '__unlazy_fpu', except that we queue a
	 * multicall to indicate FPU task switch, rather than
	 * synchronously trapping to Xen.
	 */
	if (prev_p->thread_info->status & TS_USEDFPU) {
		__save_init_fpu(prev_p); /* _not_ save_init_fpu() */
		mcl->op      = __HYPERVISOR_fpu_taskswitch;
		mcl->args[0] = 1;
		mcl++;
	}
#if 0 /* lazy fpu sanity check */
	else BUG_ON(!(read_cr0() & 8));
#endif

	/*
	 * Reload esp0.
	 * This is load_esp0(tss, next) with a multicall.
	 */
	mcl->op      = __HYPERVISOR_stack_switch;
	mcl->args[0] = __KERNEL_DS;
	mcl->args[1] = next->esp0;
	mcl++;

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 * This is load_TLS(next, cpu) with multicalls.
	 */
#define C(i) do {							\
	if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||	\
		     next->tls_array[i].b != prev->tls_array[i].b)) {	\
		mcl->op = __HYPERVISOR_update_descriptor;		\
		*(u64 *)&mcl->args[0] = virt_to_machine(		\
			&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
		*(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];	\
		mcl++;							\
	}								\
} while (0)
	C(0); C(1); C(2);
#undef C

	if (unlikely(prev->iopl != next->iopl)) {
		iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
		mcl->op      = __HYPERVISOR_physdev_op;
		mcl->args[0] = PHYSDEVOP_set_iopl;
		mcl->args[1] = (unsigned long)&iopl_op;
		mcl++;
	}

	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		set_xen_guest_handle(iobmp_op.bitmap,
				     (char *)next->io_bitmap_ptr);
		iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
		mcl->op      = __HYPERVISOR_physdev_op;
		mcl->args[0] = PHYSDEVOP_set_iobitmap;
		mcl->args[1] = (unsigned long)&iobmp_op;
		mcl++;
	}

	(void)HYPERVISOR_multicall(_mcl, mcl - _mcl);

	/*
	 * Restore %fs and %gs if needed.
	 *
	 * Glibc normally makes %fs be zero, and %gs is one of
	 * the TLS segments.
	 */
	if (unlikely(next->fs))
		loadsegment(fs, next->fs);

	if (next->gs)
		loadsegment(gs, next->gs);

	/*
	 * Now maybe handle debug registers
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
		__switch_to_xtra(next_p);

	disable_tsc(prev_p, next_p);

	return prev_p;
}

asmlinkage int sys_fork(struct pt_regs regs)
{
	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}

asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;
	int __user *parent_tidptr, *child_tidptr;

	clone_flags = regs.ebx;
	newsp = regs.ecx;
	parent_tidptr = (int __user *)regs.edx;
	child_tidptr = (int __user *)regs.edi;
	if (!newsp)
		newsp = regs.esp;
	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	int error;
	char * filename;

	filename = getname((char __user *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		goto out;
	error = do_execve(filename,
			(char __user * __user *) regs.ecx,
			(char __user * __user *) regs.edx,
			&regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
		/* Make sure we don't return using sysenter. */
		set_thread_flag(TIF_IRET);
	}
	putname(filename);
out:
	return error;
}

#define top_esp		(THREAD_SIZE - sizeof(unsigned long))
#define top_ebp		(THREAD_SIZE - 2*sizeof(unsigned long))

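/*
 * get_wchan - find where a sleeping task is waiting: walk the saved
 * %ebp frame chain on the task's kernel stack ([ebp] holds the
 * caller's ebp, [ebp+4] the return address) until we step out of the
 * scheduler's own functions, giving up after 16 frames.
 */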
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long ebp, esp, eip;
	unsigned long stack_page;
	int count = 0;
	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack_page = (unsigned long)task_stack_page(p);
	esp = p->thread.esp;
	if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
		return 0;
	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	ebp = *(unsigned long *) esp;
	do {
		if (ebp < stack_page || ebp > top_ebp+stack_page)
			return 0;
		eip = *(unsigned long *) (ebp+4);
		if (!in_sched_functions(eip))
			return eip;
		ebp = *(unsigned long *) ebp;
	} while (count++ < 16);
	return 0;
}

/*
 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
 */
static int get_free_idx(void)
{
	struct thread_struct *t = &current->thread;
	int idx;

	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
		if (desc_empty(t->tls_array + idx))
			return idx + GDT_ENTRY_TLS_MIN;
	return -ESRCH;
}

/*
 * Set a given TLS descriptor:
 */
asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
{
	struct thread_struct *t = &current->thread;
	struct user_desc info;
	struct desc_struct *desc;
	int cpu, idx;

	if (copy_from_user(&info, u_info, sizeof(info)))
		return -EFAULT;
	idx = info.entry_number;

	/*
	 * index -1 means the kernel should try to find and
	 * allocate an empty descriptor:
	 */
	if (idx == -1) {
		idx = get_free_idx();
		if (idx < 0)
			return idx;
		if (put_user(idx, &u_info->entry_number))
			return -EFAULT;
	}

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;

	/*
	 * We must not get preempted while modifying the TLS.
	 */
	cpu = get_cpu();

	if (LDT_empty(&info)) {
		desc->a = 0;
		desc->b = 0;
	} else {
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}
	load_TLS(t, cpu);

	put_cpu();

	return 0;
}
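
/*
 * Illustrative sketch (not part of this file): what a threading
 * library passes in.  entry_number == -1 asks the kernel to pick a
 * free TLS slot; the chosen index is written back to the struct.
 * tls_block below is a hypothetical base address.
 */
#if 0
	struct user_desc ud = {
		.entry_number   = -1,		/* let the kernel choose */
		.base_addr      = tls_block,	/* hypothetical TLS base */
		.limit          = 0xfffff,
		.seg_32bit      = 1,
		.limit_in_pages = 1,
		.useable        = 1,
	};
	/* userspace then issues the set_thread_area syscall with &ud
	   and uses the returned ud.entry_number to build a selector */
#endif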

/*
 * Get the current Thread-Local Storage area:
 */

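/*
 * An x86 segment descriptor scatters base and limit across both
 * words: base = a[31:16] | b[7:0]<<16 | b[31:24]<<24 and
 * limit = a[15:0] | b[19:16]<<16.  The macros below reassemble those
 * bit-fields, plus the individual attribute bits.
 */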
#define GET_BASE(desc) ( \
	(((desc)->a >> 16) & 0x0000ffff) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	( (desc)->b        & 0xff000000)   )

#define GET_LIMIT(desc) ( \
	((desc)->a & 0x0ffff) | \
	((desc)->b & 0xf0000) )

#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)

asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
{
	struct user_desc info;
	struct desc_struct *desc;
	int idx;

	if (get_user(idx, &u_info->entry_number))
		return -EFAULT;
	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	memset(&info, 0, sizeof(info));

	desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;

	info.entry_number = idx;
	info.base_addr = GET_BASE(desc);
	info.limit = GET_LIMIT(desc);
	info.seg_32bit = GET_32BIT(desc);
	info.contents = GET_CONTENTS(desc);
	info.read_exec_only = !GET_WRITABLE(desc);
	info.limit_in_pages = GET_LIMIT_PAGES(desc);
	info.seg_not_present = !GET_PRESENT(desc);
	info.useable = GET_USEABLE(desc);

	if (copy_to_user(u_info, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}

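/*
 * Randomize the initial stack pointer by up to 8kB when VA-space
 * randomization is enabled, keeping it 16-byte aligned.
 */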
unsigned long arch_align_stack(unsigned long sp)
{
	if (randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
---|