Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/xen-3.1/xen-3.1/linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c @ 34

Last change on this file since 34 was 34, checked in by hartmans, 17 years ago
Add xen and xen-common
File size: 10.6 KB

Line
1	/*
2	* Copyright 2002 Andi Kleen, SuSE Labs.
3	* Thanks to Ben LaHaise for precious feedback.
4	*/
5
6	#include <linux/mm.h>
7	#include <linux/sched.h>
8	#include <linux/highmem.h>
9	#include <linux/module.h>
10	#include <linux/slab.h>
11	#include <asm/uaccess.h>
12	#include <asm/processor.h>
13	#include <asm/tlbflush.h>
14	#include <asm/io.h>
15
16	#ifdef CONFIG_XEN
17	#include <asm/pgalloc.h>
18	#include <asm/mmu_context.h>
19
/*
 * Held by mm_pin() and mm_unpin() while they add to or remove from
 * the mm_unpinned list.
 */
DEFINE_SPINLOCK(mm_unpinned_lock);

/*
 * List of mm contexts linked through context.unpinned: mm_unpin() adds
 * an mm here and mm_pin() removes it again.
 */
LIST_HEAD(mm_unpinned);
22
23	static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
24	{
25	struct page *page = virt_to_page(pt);
26	unsigned long pfn = page_to_pfn(page);
27	int rc;
28
29	rc = HYPERVISOR_update_va_mapping(
30	(unsigned long)__va(pfn << PAGE_SHIFT),
31	pfn_pte(pfn, flags), 0);
32	if (rc)
33	BUG();
34	}
35
36	static void mm_walk(struct mm_struct *mm, pgprot_t flags)
37	{
38	pgd_t *pgd;
39	pud_t *pud;
40	pmd_t *pmd;
41	pte_t *pte;
42	int g,u,m;
43
44	pgd = mm->pgd;
45	/*
46	* Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
47	* be the 'current' task's pagetables (e.g., current may be 32-bit,
48	* but the pagetables may be for a 64-bit task).
49	* Subtracting 1 from TASK_SIZE64 means the loop limit is correct
50	* regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
51	*/
52	for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
53	if (pgd_none(*pgd))
54	continue;
55	pud = pud_offset(pgd, 0);
56	if (PTRS_PER_PUD > 1) /* not folded */
57	mm_walk_set_prot(pud,flags);
58	for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
59	if (pud_none(*pud))
60	continue;
61	pmd = pmd_offset(pud, 0);
62	if (PTRS_PER_PMD > 1) /* not folded */
63	mm_walk_set_prot(pmd,flags);
64	for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
65	if (pmd_none(*pmd))
66	continue;
67	pte = pte_offset_kernel(pmd,0);
68	mm_walk_set_prot(pte,flags);
69	}
70	}
71	}
72	}
73
74	void mm_pin(struct mm_struct *mm)
75	{
76	if (xen_feature(XENFEAT_writable_page_tables))
77	return;
78
79	spin_lock(&mm->page_table_lock);
80
81	mm_walk(mm, PAGE_KERNEL_RO);
82	if (HYPERVISOR_update_va_mapping(
83	(unsigned long)mm->pgd,
84	pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
85	UVMF_TLB_FLUSH))
86	BUG();
87	if (HYPERVISOR_update_va_mapping(
88	(unsigned long)__user_pgd(mm->pgd),
89	pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT,
90	PAGE_KERNEL_RO),
91	UVMF_TLB_FLUSH))
92	BUG();
93	xen_pgd_pin(__pa(mm->pgd)); /* kernel */
94	xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
95	mm->context.pinned = 1;
96	spin_lock(&mm_unpinned_lock);
97	list_del(&mm->context.unpinned);
98	spin_unlock(&mm_unpinned_lock);
99
100	spin_unlock(&mm->page_table_lock);
101	}
102
103	void mm_unpin(struct mm_struct *mm)
104	{
105	if (xen_feature(XENFEAT_writable_page_tables))
106	return;
107
108	spin_lock(&mm->page_table_lock);
109
110	xen_pgd_unpin(__pa(mm->pgd));
111	xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
112	if (HYPERVISOR_update_va_mapping(
113	(unsigned long)mm->pgd,
114	pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0))
115	BUG();
116	if (HYPERVISOR_update_va_mapping(
117	(unsigned long)__user_pgd(mm->pgd),
118	pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT,
119	PAGE_KERNEL), 0))
120	BUG();
121	mm_walk(mm, PAGE_KERNEL);
122	xen_tlb_flush();
123	mm->context.pinned = 0;
124	spin_lock(&mm_unpinned_lock);
125	list_add(&mm->context.unpinned, &mm_unpinned);
126	spin_unlock(&mm_unpinned_lock);
127
128	spin_unlock(&mm->page_table_lock);
129	}
130
131	void mm_pin_all(void)
132	{
133	if (xen_feature(XENFEAT_writable_page_tables))
134	return;
135
136	/*
137	* Allow uninterrupted access to the mm_unpinned list. We don't
138	* actually take the mm_unpinned_lock as it is taken inside mm_pin().
139	* All other CPUs must be at a safe point (e.g., in stop_machine
140	* or offlined entirely).
141	*/
142	preempt_disable();
143	while (!list_empty(&mm_unpinned))
144	mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
145	context.unpinned));
146	preempt_enable();
147	}
148
149	void _arch_dup_mmap(struct mm_struct *mm)
150	{
151	if (!mm->context.pinned)
152	mm_pin(mm);
153	}
154
155	void _arch_exit_mmap(struct mm_struct *mm)
156	{
157	struct task_struct *tsk = current;
158
159	task_lock(tsk);
160
161	/*
162	* We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
163	* much faster this way, as no tlb flushes means bigger wrpt batches.
164	*/
165	if (tsk->active_mm == mm) {
166	tsk->active_mm = &init_mm;
167	atomic_inc(&init_mm.mm_count);
168
169	switch_mm(mm, &init_mm, tsk);
170
171	atomic_dec(&mm->mm_count);
172	BUG_ON(atomic_read(&mm->mm_count) == 0);
173	}
174
175	task_unlock(tsk);
176
177	if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
178	!mm->context.has_foreign_mappings )
179	mm_unpin(mm);
180	}
181
182	struct page pte_alloc_one(struct mm_struct mm, unsigned long address)
183	{
184	struct page *pte;
185
186	pte = alloc_pages(GFP_KERNEL\|__GFP_REPEAT\|__GFP_ZERO, 0);
187	if (pte) {
188	SetPageForeign(pte, pte_free);
189	init_page_count(pte);
190	}
191	return pte;
192	}
193
194	void pte_free(struct page *pte)
195	{
196	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
197
198	if (!pte_write(*virt_to_ptep(va)))
199	if (HYPERVISOR_update_va_mapping(
200	va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
201	BUG();
202
203	ClearPageForeign(pte);
204	init_page_count(pte);
205
206	__free_page(pte);
207	}
208	#endif /* CONFIG_XEN */
209
210	static inline pte_t *lookup_address(unsigned long address)
211	{
212	pgd_t *pgd = pgd_offset_k(address);
213	pud_t *pud;
214	pmd_t *pmd;
215	pte_t *pte;
216	if (pgd_none(*pgd))
217	return NULL;
218	pud = pud_offset(pgd, address);
219	if (!pud_present(*pud))
220	return NULL;
221	pmd = pmd_offset(pud, address);
222	if (!pmd_present(*pmd))
223	return NULL;
224	if (pmd_large(*pmd))
225	return (pte_t *)pmd;
226	pte = pte_offset_kernel(pmd, address);
227	if (pte && !pte_present(*pte))
228	pte = NULL;
229	return pte;
230	}
231
232	static struct page *split_large_page(unsigned long address, pgprot_t prot,
233	pgprot_t ref_prot)
234	{
235	int i;
236	unsigned long addr;
237	struct page *base = alloc_pages(GFP_KERNEL, 0);
238	pte_t *pbase;
239	if (!base)
240	return NULL;
241	/*
242	* page_private is used to track the number of entries in
243	* the page table page have non standard attributes.
244	*/
245	SetPagePrivate(base);
246	page_private(base) = 0;
247
248	address = __pa(address);
249	addr = address & LARGE_PAGE_MASK;
250	pbase = (pte_t *)page_address(base);
251	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
252	pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
253	addr == address ? prot : ref_prot);
254	}
255	return base;
256	}
257
258
259	static void flush_kernel_map(void *address)
260	{
261	if (0 && address && cpu_has_clflush) {
262	/* is this worth it? */
263	int i;
264	for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
265	asm volatile("clflush (%0)" :: "r" (address + i));
266	} else
267	asm volatile("wbinvd":::"memory");
268	if (address)
269	__flush_tlb_one(address);
270	else
271	__flush_tlb_all();
272	}
273
274
275	static inline void flush_map(unsigned long address)
276	{
277	on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
278	}
279
/*
 * Head of a singly linked chain of page-table pages waiting to be
 * freed; pages are chained through lru.next by save_page() and
 * released in global_flush_tlb().
 */
static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
281
282	static inline void save_page(struct page *fpage)
283	{
284	fpage->lru.next = (struct list_head *)deferred_pages;
285	deferred_pages = fpage;
286	}
287
288	/*
289	* No more special protections in this 2/4MB area - revert to a
290	* large page again.
291	*/
292	static void revert_page(unsigned long address, pgprot_t ref_prot)
293	{
294	pgd_t *pgd;
295	pud_t *pud;
296	pmd_t *pmd;
297	pte_t large_pte;
298
299	pgd = pgd_offset_k(address);
300	BUG_ON(pgd_none(*pgd));
301	pud = pud_offset(pgd,address);
302	BUG_ON(pud_none(*pud));
303	pmd = pmd_offset(pud, address);
304	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
305	pgprot_val(ref_prot) \|= _PAGE_PSE;
306	large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
307	set_pte((pte_t *)pmd, large_pte);
308	}
309
310	static int
311	__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
312	pgprot_t ref_prot)
313	{
314	pte_t *kpte;
315	struct page *kpte_page;
316	unsigned kpte_flags;
317	pgprot_t ref_prot2;
318	kpte = lookup_address(address);
319	if (!kpte) return 0;
320	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
321	kpte_flags = pte_val(*kpte);
322	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
323	if ((kpte_flags & _PAGE_PSE) == 0) {
324	set_pte(kpte, pfn_pte(pfn, prot));
325	} else {
326	/*
327	* split_large_page will take the reference for this
328	* change_page_attr on the split page.
329	*/
330
331	struct page *split;
332	ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
333
334	split = split_large_page(address, prot, ref_prot2);
335	if (!split)
336	return -ENOMEM;
337	set_pte(kpte,mk_pte(split, ref_prot2));
338	kpte_page = split;
339	}
340	page_private(kpte_page)++;
341	} else if ((kpte_flags & _PAGE_PSE) == 0) {
342	set_pte(kpte, pfn_pte(pfn, ref_prot));
343	BUG_ON(page_private(kpte_page) == 0);
344	page_private(kpte_page)--;
345	} else
346	BUG();
347
348	/* on x86-64 the direct mapping set at boot is not using 4k pages */
349	/*
350	* ..., but the XEN guest kernels (currently) do:
351	* If the pte was reserved, it means it was created at boot
352	* time (not via split_large_page) and in turn we must not
353	* replace it with a large page.
354	*/
355	#ifndef CONFIG_XEN
356	BUG_ON(PageReserved(kpte_page));
357	#else
358	if (PageReserved(kpte_page))
359	return 0;
360	#endif
361
362	if (page_private(kpte_page) == 0) {
363	save_page(kpte_page);
364	revert_page(address, ref_prot);
365	}
366	return 0;
367	}
368
369	/*
370	* Change the page attributes of an page in the linear mapping.
371	*
372	* This should be used when a page is mapped with a different caching policy
373	* than write-back somewhere - some CPUs do not like it when mappings with
374	* different caching policies exist. This changes the page attributes of the
375	* in kernel linear mapping too.
376	*
377	* The caller needs to ensure that there are no conflicting mappings elsewhere.
378	* This function only deals with the kernel linear map.
379	*
380	* Caller must call global_flush_tlb() after this.
381	*/
382	int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
383	{
384	int err = 0;
385	int i;
386
387	down_write(&init_mm.mmap_sem);
388	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
389	unsigned long pfn = __pa(address) >> PAGE_SHIFT;
390
391	err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
392	if (err)
393	break;
394	/* Handle kernel mapping too which aliases part of the
395	* lowmem */
396	if (__pa(address) < KERNEL_TEXT_SIZE) {
397	unsigned long addr2;
398	pgprot_t prot2 = prot;
399	addr2 = __START_KERNEL_map + __pa(address);
400	pgprot_val(prot2) &= ~_PAGE_NX;
401	err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
402	}
403	}
404	up_write(&init_mm.mmap_sem);
405	return err;
406	}
407
408	/* Don't call this for MMIO areas that may not have a mem_map entry */
409	int change_page_attr(struct page *page, int numpages, pgprot_t prot)
410	{
411	unsigned long addr = (unsigned long)page_address(page);
412	return change_page_attr_addr(addr, numpages, prot);
413	}
414
415	void global_flush_tlb(void)
416	{
417	struct page *dpage;
418
419	down_read(&init_mm.mmap_sem);
420	dpage = xchg(&deferred_pages, NULL);
421	up_read(&init_mm.mmap_sem);
422
423	flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
424	while (dpage) {
425	struct page *tmp = dpage;
426	dpage = (struct page *)dpage->lru.next;
427	ClearPagePrivate(tmp);
428	__free_page(tmp);
429	}
430	}
431
432	EXPORT_SYMBOL(change_page_attr);
433	EXPORT_SYMBOL(global_flush_tlb);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: