/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

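/*
 * Decide whether a dying task should trigger a switch to the crash
 * kernel: do so when the failure happened in interrupt context, when
 * the victim is the idle task (pid 0) or init (pid 1), or when
 * panic_on_oops is set.
 */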
int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add
	 * it to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

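/*
 * Allocate 2^order physically contiguous pages for kexec's own use.
 * The order is stashed in page_private() and every page is marked
 * reserved, so that kimage_free_pages() can undo both steps later.
 * Under Xen the region is additionally made machine-contiguous below
 * the given address limit via xen_create_contiguous_region().
 */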
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order,
				       unsigned long limit)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
#ifdef CONFIG_XEN
		int address_bits;

		if (limit == ~0UL)
			address_bits = BITS_PER_LONG;
		else
			address_bits = long_log2(limit);

		if (xen_create_contiguous_region((unsigned long)page_address(pages),
						 order, address_bits) < 0) {
			__free_pages(pages, order);
			return NULL;
		}
#endif
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
#ifdef CONFIG_XEN
	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
#endif
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order,
					   KEXEC_CONTROL_MEMORY_LIMIT);
		if (!pages)
			break;
		pfn   = kexec_page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

#ifndef CONFIG_XEN
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}
#else /* CONFIG_XEN */
struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	return kimage_alloc_normal_control_pages(image, order);
}
#endif

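/*
 * Append one entry to the image's kimage_entry_t list.  When the
 * current indirection page fills up, a fresh page is allocated, linked
 * in with IND_INDIRECTION, and writing continues there.  The list is
 * kept zero-terminated so that kimage_terminate() can simply overwrite
 * the trailing zero with IND_DONE.
 */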
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				    ((PAGE_SIZE / sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}

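/*
 * Walk the image's entry list.  Each kimage_entry_t is a physical
 * address with flag bits in the low-order bits: IND_DESTINATION sets
 * the current destination, IND_SOURCE names a source page (and
 * implicitly advances the destination by one page), IND_INDIRECTION
 * points to the next indirection page, and IND_DONE terminates the
 * list.
 */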
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
	     ptr = (entry & IND_INDIRECTION) ? \
		   kexec_phys_to_virt(entry & PAGE_MASK) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

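/*
 * Tear down a partially or fully loaded image: free the cached extra
 * pages, then walk the entry list releasing every source page.  Each
 * indirection page is kept until the walk has moved past it (the walk
 * itself reads entries from it) and is freed afterwards, before the
 * control pages and the kimage structure itself are released.
 */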
static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

#ifdef CONFIG_XEN
	xen_machine_kexec_unload(image);
#endif

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

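/*
 * Find the entry list slot whose source page will be copied to the
 * given destination page, if one has been recorded already.  Used by
 * kimage_alloc_page() when the page it just allocated turns out to be
 * somebody else's destination.
 */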
static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0,
					  KEXEC_SOURCE_MEMORY_LIMIT);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (kexec_page_to_pfn(page) >
		    (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone else's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}

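/*
 * Copy one segment from user space into freshly allocated pages and
 * record those pages in the entry list.  Each destination page is
 * zeroed first, so any part of memsz that extends beyond bufsz ends up
 * zero-filled.
 */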
static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, kexec_page_to_pfn(page)
						<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

#ifndef CONFIG_XEN
static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}
#else /* CONFIG_XEN */
static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	return kimage_load_normal_segment(image, segment);
}
#endif

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination, and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
			       struct kexec_segment __user *segments,
			       unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
						     nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						    nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
#ifdef CONFIG_XEN
	if (image) {
		result = xen_machine_kexec_load(image);
		if (result)
			goto out;
	}
#endif
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}

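/*
 * 32-bit compatibility entry point: convert each compat_kexec_segment
 * descriptor to a native struct kexec_segment in compat-allocated user
 * space, then hand the converted vector to sys_kexec_load().
 */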
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				      unsigned long nr_segments,
				      struct compat_kexec_segment __user *segments,
				      unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;
			crash_setup_regs(&fixed_regs, regs);
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		xchg(&kexec_lock, 0);
	}
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk("Kexec: Memory allocation for saving cpu register"
		       " states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)
---|