Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/xen-common/xen-common/linux-2.6-xen-sparse/mm/page_alloc.c @ 34

Last change on this file since 34 was 34, checked in by hartmans, 17 years ago
Add xen and xen-common
File size: 62.8 KB

Rev	Line
[34]	1	/*
	2	* linux/mm/page_alloc.c
	3	*
	4	* Manages the free list, the system allocates free pages here.
	5	* Note that kmalloc() lives in slab.c
	6	*
	7	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
	8	* Swap reorganised 29.12.95, Stephen Tweedie
	9	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
	10	* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
	11	* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
	12	* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
	13	* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
	14	* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
	15	*/
	16
	17	#include <linux/stddef.h>
	18	#include <linux/mm.h>
	19	#include <linux/swap.h>
	20	#include <linux/interrupt.h>
	21	#include <linux/pagemap.h>
	22	#include <linux/bootmem.h>
	23	#include <linux/compiler.h>
	24	#include <linux/kernel.h>
	25	#include <linux/module.h>
	26	#include <linux/suspend.h>
	27	#include <linux/pagevec.h>
	28	#include <linux/blkdev.h>
	29	#include <linux/slab.h>
	30	#include <linux/notifier.h>
	31	#include <linux/topology.h>
	32	#include <linux/sysctl.h>
	33	#include <linux/cpu.h>
	34	#include <linux/cpuset.h>
	35	#include <linux/memory_hotplug.h>
	36	#include <linux/nodemask.h>
	37	#include <linux/vmalloc.h>
	38	#include <linux/mempolicy.h>
	39	#include <linux/stop_machine.h>
	40
	41	#include <asm/tlbflush.h>
	42	#include <asm/div64.h>
	43	#include "internal.h"
	44
	45	/*
	46	* MCD - HACK: Find somewhere to initialize this EARLY, or make this
	47	* initializer cleaner
	48	*/
	49	nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
	50	EXPORT_SYMBOL(node_online_map);
	51	nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
	52	EXPORT_SYMBOL(node_possible_map);
	53	unsigned long totalram_pages __read_mostly;
	54	unsigned long totalhigh_pages __read_mostly;
	55	unsigned long totalreserve_pages __read_mostly;
	56	long nr_swap_pages;
	57	int percpu_pagelist_fraction;
	58
	59	static void __free_pages_ok(struct page *page, unsigned int order);
	60
	61	/*
	62	* results with 256, 32 in the lowmem_reserve sysctl:
	63	* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
	64	* 1G machine -> (16M dma, 784M normal, 224M high)
	65	* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
	66	* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
	67	* HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
	68	*
	69	* TBD: should special case ZONE_DMA32 machines here - in those we normally
	70	* don't need any ZONE_NORMAL reservation
	71	*/
	72	int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
	73
	74	EXPORT_SYMBOL(totalram_pages);
	75
	76	/*
	77	* Used by page_zone() to look up the address of the struct zone whose
	78	* id is encoded in the upper bits of page->flags
	79	*/
	80	struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
	81	EXPORT_SYMBOL(zone_table);
	82
	83	static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
	84	int min_free_kbytes = 1024;
	85
	86	unsigned long __meminitdata nr_kernel_pages;
	87	unsigned long __meminitdata nr_all_pages;
	88
	89	#ifdef CONFIG_DEBUG_VM
	90	static int page_outside_zone_boundaries(struct zone zone, struct page page)
	91	{
	92	int ret = 0;
	93	unsigned seq;
	94	unsigned long pfn = page_to_pfn(page);
	95
	96	do {
	97	seq = zone_span_seqbegin(zone);
	98	if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
	99	ret = 1;
	100	else if (pfn < zone->zone_start_pfn)
	101	ret = 1;
	102	} while (zone_span_seqretry(zone, seq));
	103
	104	return ret;
	105	}
	106
	/*
	 * Check that @page really belongs to @zone.
	 *
	 * With CONFIG_HOLES_IN_ZONE the pfn must also pass pfn_valid().
	 * Returns 1 when page_zone(page) is @zone, 0 otherwise.
	 */
	static int page_is_consistent(struct zone *zone, struct page *page)
	{
	#ifdef CONFIG_HOLES_IN_ZONE
		if (!pfn_valid(page_to_pfn(page)))
			return 0;
	#endif
		if (zone != page_zone(page))
			return 0;

		return 1;
	}
	/*
	 * Temporary debugging check for pages not lying within a given zone.
	 *
	 * Returns 1 if @page is outside @zone's pfn span or is not consistent
	 * with @zone, 0 if both checks pass.
	 */
	static int bad_range(struct zone *zone, struct page *page)
	{
		if (page_outside_zone_boundaries(zone, page))
			return 1;
		if (!page_is_consistent(zone, page))
			return 1;

		return 0;
	}
	130
	131	#else
	/* Without CONFIG_DEBUG_VM the range check is compiled out: never "bad". */
	static inline int bad_range(struct zone *zone, struct page *page)
	{
		return 0;
	}
	136	#endif
	137
	138	static void bad_page(struct page *page)
	139	{
	140	printk(KERN_EMERG "Bad page state in process '%s'\n"
	141	KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
	142	KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
	143	KERN_EMERG "Backtrace:\n",
	144	current->comm, page, (int)(2*sizeof(unsigned long)),
	145	(unsigned long)page->flags, page->mapping,
	146	page_mapcount(page), page_count(page));
	147	dump_stack();
	148	page->flags &= ~(1 << PG_lru \|
	149	1 << PG_private \|
	150	1 << PG_locked \|
	151	1 << PG_active \|
	152	1 << PG_dirty \|
	153	1 << PG_reclaim \|
	154	1 << PG_slab \|
	155	1 << PG_swapcache \|
	156	1 << PG_writeback \|
	157	1 << PG_buddy \|
	158	#ifdef CONFIG_X86_XEN
	159	1 << PG_pinned \|
	160	#endif
	161	1 << PG_foreign );
	162	set_page_count(page, 0);
	163	reset_page_mapcount(page);
	164	page->mapping = NULL;
	165	add_taint(TAINT_BAD_PAGE);
	166	}
	167
	168	/*
	169	* Higher-order pages are called "compound pages". They are structured thusly:
	170	*
	171	* The first PAGE_SIZE page is called the "head page".
	172	*
	173	* The remaining PAGE_SIZE pages are called "tail pages".
	174	*
	175	* All pages have PG_compound set. All pages have their ->private pointing at
	176	* the head page (even the head page has this).
	177	*
	178	* The first tail page's ->lru.next holds the address of the compound page's
	179	* put_page() function. Its ->lru.prev holds the order of allocation.
	180	* This usage means that zero-order pages may not be compound.
	181	*/
	182
	183	static void free_compound_page(struct page *page)
	184	{
	185	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
	186	}
	187
	188	static void prep_compound_page(struct page *page, unsigned long order)
	189	{
	190	int i;
	191	int nr_pages = 1 << order;
	192
	193	page[1].lru.next = (void )free_compound_page; / set dtor */
	194	page[1].lru.prev = (void *)order;
	195	for (i = 0; i < nr_pages; i++) {
	196	struct page *p = page + i;
	197
	198	__SetPageCompound(p);
	199	set_page_private(p, (unsigned long)page);
	200	}
	201	}
	202
	203	static void destroy_compound_page(struct page *page, unsigned long order)
	204	{
	205	int i;
	206	int nr_pages = 1 << order;
	207
	208	if (unlikely((unsigned long)page[1].lru.prev != order))
	209	bad_page(page);
	210
	211	for (i = 0; i < nr_pages; i++) {
	212	struct page *p = page + i;
	213
	214	if (unlikely(!PageCompound(p) \|
	215	(page_private(p) != (unsigned long)page)))
	216	bad_page(page);
	217	__ClearPageCompound(p);
	218	}
	219	}
	220
	221	static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
	222	{
	223	int i;
	224
	225	BUG_ON((gfp_flags & (__GFP_WAIT \| __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	226	/*
	227	* clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	228	* and __GFP_HIGHMEM from hard or soft interrupt context.
	229	*/
	230	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	231	for (i = 0; i < (1 << order); i++)
	232	clear_highpage(page + i);
	233	}
	234
	235	/*
	236	* function for dealing with page's order in buddy system.
	237	* zone->lock is already acquired when we use these.
	238	* So, we don't need atomic page->flags operations here.
	239	*/
	/* Read back the buddy order kept in the page's private field. */
	static inline unsigned long page_order(struct page *page)
	{
		unsigned long order = page_private(page);

		return order;
	}
	244
	/*
	 * Record @order in the page's private field, then flag the page as
	 * being in the buddy system (non-atomic flag update).
	 */
	static inline void set_page_order(struct page *page, int order)
	{
		set_page_private(page, order);

		__SetPageBuddy(page);
	}
	250
	/*
	 * Take the page out of the buddy system: drop PG_buddy first
	 * (non-atomic), then reset the stored order to 0.
	 */
	static inline void rmv_page_order(struct page *page)
	{
		__ClearPageBuddy(page);

		set_page_private(page, 0);
	}
	256
	257	/*
	258	* Locate the struct page for both the matching buddy in our
	259	* pair (buddy1) and the combined O(n+1) page they form (page).
	260	*
	261	* 1) Any buddy B1 will have an order O twin B2 which satisfies
	262	* the following equation:
	263	* B2 = B1 ^ (1 << O)
	264	* For example, if the starting buddy (buddy2) is #8 its order
	265	* 1 buddy is #10:
	266	* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
	267	*
	268	* 2) Any buddy B will have an order O+1 parent P which
	269	* satisfies the following equation:
	270	* P = B & ~(1 << O)
	271	*
	272	* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
	273	*/
	274	static inline struct page *
	275	__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
	276	{
	277	unsigned long buddy_idx = page_idx ^ (1 << order);
	278
	279	return page + (buddy_idx - page_idx);
	280	}
	281
	/*
	 * Return the index of the order-(@order + 1) block that contains
	 * @page_idx, i.e. @page_idx with bit @order cleared.
	 *
	 * The mask is built as unsigned long so that only bit @order is cleared,
	 * whatever the width of int.
	 */
	static inline unsigned long
	__find_combined_index(unsigned long page_idx, unsigned int order)
	{
		return (page_idx & ~(1UL << order));
	}
	287
	/*
	 * This function checks whether a page is free && is the buddy
	 * we can do coalesce a page and its buddy if
	 * (a) the buddy is not in a hole &&
	 * (b) the buddy is in the buddy system &&
	 * (c) a page and its buddy have the same order &&
	 * (d) a page and its buddy are in the same zone.
	 *
	 * For recording whether a page is in the buddy system, we use PG_buddy.
	 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
	 *
	 * For recording page's order, we use page_private(page).
	 *
	 * Returns 1 if @buddy can be merged with @page at @order, 0 otherwise.
	 * A mergeable buddy with a non-zero refcount triggers BUG_ON().
	 */
	static inline int page_is_buddy(struct page *page, struct page *buddy,
									int order)
	{
	#ifdef CONFIG_HOLES_IN_ZONE
		if (!pfn_valid(page_to_pfn(buddy)))
			return 0;
	#endif

		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		if (PageBuddy(buddy) && page_order(buddy) == order) {
			BUG_ON(page_count(buddy) != 0);
			return 1;
		}
		return 0;
	}
	318
	319	/*
	320	* Freeing function for a buddy system allocator.
	321	*
	322	* The concept of a buddy system is to maintain direct-mapped table
	323	* (containing bit values) for memory blocks of various "orders".
	324	* The bottom level table contains the map for the smallest allocatable
	325	* units of memory (here, pages), and each level above it describes
	326	* pairs of units from the levels below, hence, "buddies".
	327	* At a high level, all that happens here is marking the table entry
	328	* at the bottom level available, and propagating the changes upward
	329	* as necessary, plus some accounting needed to play nicely with other
	330	* parts of the VM system.
	331	* At each level, we keep a list of pages, which are heads of continuous
	332	* free pages of length of (1 << order) and marked with PG_buddy. Page's
	333	* order is recorded in page_private(page) field.
	334	* So when we are allocating or freeing one, we can derive the state of the
	335	* other. That is, if we allocate a small block, and both were
	336	* free, the remainder of the region must be split into blocks.
	337	* If a block is freed, and its buddy is also free, then this
	338	* triggers coalescing into a block of larger size.
	339	*
	340	* -- wli
	341	*/
	342
	343	static inline void __free_one_page(struct page *page,
	344	struct zone *zone, unsigned int order)
	345	{
	346	unsigned long page_idx;
	347	int order_size = 1 << order;
	348
	349	if (unlikely(PageCompound(page)))
	350	destroy_compound_page(page, order);
	351
	352	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
	353
	354	BUG_ON(page_idx & (order_size - 1));
	355	BUG_ON(bad_range(zone, page));
	356
	357	zone->free_pages += order_size;
	358	while (order < MAX_ORDER-1) {
	359	unsigned long combined_idx;
	360	struct free_area *area;
	361	struct page *buddy;
	362
	363	buddy = __page_find_buddy(page, page_idx, order);
	364	if (!page_is_buddy(page, buddy, order))
	365	break; /* Move the buddy up one level. */
	366
	367	list_del(&buddy->lru);
	368	area = zone->free_area + order;
	369	area->nr_free--;
	370	rmv_page_order(buddy);
	371	combined_idx = __find_combined_index(page_idx, order);
	372	page = page + (combined_idx - page_idx);
	373	page_idx = combined_idx;
	374	order++;
	375	}
	376	set_page_order(page, order);
	377	list_add(&page->lru, &zone->free_area[order].free_list);
	378	zone->free_area[order].nr_free++;
	379	}
	380
	381	static inline int free_pages_check(struct page *page)
	382	{
	383	if (unlikely(page_mapcount(page) \|
	384	(page->mapping != NULL) \|
	385	(page_count(page) != 0) \|
	386	(page->flags & (
	387	1 << PG_lru \|
	388	1 << PG_private \|
	389	1 << PG_locked \|
	390	1 << PG_active \|
	391	1 << PG_reclaim \|
	392	1 << PG_slab \|
	393	1 << PG_swapcache \|
	394	1 << PG_writeback \|
	395	1 << PG_reserved \|
	396	1 << PG_buddy \|
	397	#ifdef CONFIG_X86_XEN
	398	1 << PG_pinned \|
	399	#endif
	400	1 << PG_foreign ))))
	401	bad_page(page);
	402	if (PageDirty(page))
	403	__ClearPageDirty(page);
	404	/*
	405	* For now, we report if PG_reserved was found set, but do not
	406	* clear it, and do not free the page. But we shall soon need
	407	* to do more, for when the ZERO_PAGE count wraps negative.
	408	*/
	409	return PageReserved(page);
	410	}
	411
	412	/*
	413	* Frees a list of pages.
	414	* Assumes all pages on list are in same zone, and of same order.
	415	* count is the number of pages to free.
	416	*
	417	* If the zone was previously in an "all pages pinned" state then look to
	418	* see if this freeing clears that state.
	419	*
	420	* And clear the zone's pages_scanned counter, to hold off the "all pages are
	421	* pinned" detection logic.
	422	*/
	423	static void free_pages_bulk(struct zone *zone, int count,
	424	struct list_head *list, int order)
	425	{
	426	spin_lock(&zone->lock);
	427	zone->all_unreclaimable = 0;
	428	zone->pages_scanned = 0;
	429	while (count--) {
	430	struct page *page;
	431
	432	BUG_ON(list_empty(list));
	433	page = list_entry(list->prev, struct page, lru);
	434	/* have to delete it as __free_one_page list manipulates */
	435	list_del(&page->lru);
	436	__free_one_page(page, zone, order);
	437	}
	438	spin_unlock(&zone->lock);
	439	}
	440
	441	static void free_one_page(struct zone zone, struct page page, int order)
	442	{
	443	LIST_HEAD(list);
	444	list_add(&page->lru, &list);
	445	free_pages_bulk(zone, 1, &list, order);
	446	}
	447
	448	static void __free_pages_ok(struct page *page, unsigned int order)
	449	{
	450	unsigned long flags;
	451	int i;
	452	int reserved = 0;
	453
	454	if (arch_free_page(page, order))
	455	return;
	456	if (!PageHighMem(page))
	457	debug_check_no_locks_freed(page_address(page),
	458	PAGE_SIZE<<order);
	459
	460	for (i = 0 ; i < (1 << order) ; ++i)
	461	reserved += free_pages_check(page + i);
	462	if (reserved)
	463	return;
	464
	465	kernel_map_pages(page, 1 << order, 0);
	466	local_irq_save(flags);
	467	__count_vm_events(PGFREE, 1 << order);
	468	free_one_page(page_zone(page), page, order);
	469	local_irq_restore(flags);
	470	}
	471
	472	/*
	473	* permit the bootmem allocator to evade page validation on high-order frees
	474	*/
	475	void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
	476	{
	477	if (order == 0) {
	478	__ClearPageReserved(page);
	479	set_page_count(page, 0);
	480	set_page_refcounted(page);
	481	__free_page(page);
	482	} else {
	483	int loop;
	484
	485	prefetchw(page);
	486	for (loop = 0; loop < BITS_PER_LONG; loop++) {
	487	struct page *p = &page[loop];
	488
	489	if (loop + 1 < BITS_PER_LONG)
	490	prefetchw(p + 1);
	491	__ClearPageReserved(p);
	492	set_page_count(p, 0);
	493	}
	494
	495	set_page_refcounted(page);
	496	__free_pages(page, order);
	497	}
	498	}
	499
	500
	501	/*
	502	* The order of subdivision here is critical for the IO subsystem.
	503	* Please do not alter this order without good reasons and regression
	504	* testing. Specifically, as large blocks of memory are subdivided,
	505	* the order in which smaller blocks are delivered depends on the order
	506	* they're subdivided in this function. This is the primary factor
	507	* influencing the order in which pages are delivered to the IO
	508	* subsystem according to empirical testing, and this is also justified
	509	* by considering the behavior of a buddy system containing a single
	510	* large block of memory acted on by a series of small allocations.
	511	* This behavior is a critical factor in sglist merging's success.
	512	*
	513	* -- wli
	514	*/
	515	static inline void expand(struct zone zone, struct page page,
	516	int low, int high, struct free_area *area)
	517	{
	518	unsigned long size = 1 << high;
	519
	520	while (high > low) {
	521	area--;
	522	high--;
	523	size >>= 1;
	524	BUG_ON(bad_range(zone, &page[size]));
	525	list_add(&page[size].lru, &area->free_list);
	526	area->nr_free++;
	527	set_page_order(&page[size], high);
	528	}
	529	}
	530
	531	/*
	532	* This page is about to be returned from the page allocator
	533	*/
	534	static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
	535	{
	536	if (unlikely(page_mapcount(page) \|
	537	(page->mapping != NULL) \|
	538	(page_count(page) != 0) \|
	539	(page->flags & (
	540	1 << PG_lru \|
	541	1 << PG_private \|
	542	1 << PG_locked \|
	543	1 << PG_active \|
	544	1 << PG_dirty \|
	545	1 << PG_reclaim \|
	546	1 << PG_slab \|
	547	1 << PG_swapcache \|
	548	1 << PG_writeback \|
	549	1 << PG_reserved \|
	550	1 << PG_buddy \|
	551	#ifdef CONFIG_X86_XEN
	552	1 << PG_pinned \|
	553	#endif
	554	1 << PG_foreign ))))
	555	bad_page(page);
	556
	557	/*
	558	* For now, we report if PG_reserved was found set, but do not
	559	* clear it, and do not allocate the page: as a safety net.
	560	*/
	561	if (PageReserved(page))
	562	return 1;
	563
	564	page->flags &= ~(1 << PG_uptodate \| 1 << PG_error \|
	565	1 << PG_referenced \| 1 << PG_arch_1 \|
	566	1 << PG_checked \| 1 << PG_mappedtodisk);
	567	set_page_private(page, 0);
	568	set_page_refcounted(page);
	569	kernel_map_pages(page, 1 << order, 1);
	570
	571	if (gfp_flags & __GFP_ZERO)
	572	prep_zero_page(page, order, gfp_flags);
	573
	574	if (order && (gfp_flags & __GFP_COMP))
	575	prep_compound_page(page, order);
	576
	577	return 0;
	578	}
	579
	580	/*
	581	* Do the hard work of removing an element from the buddy allocator.
	582	* Call me with the zone->lock already held.
	583	*/
	584	static struct page __rmqueue(struct zone zone, unsigned int order)
	585	{
	586	struct free_area * area;
	587	unsigned int current_order;
	588	struct page *page;
	589
	590	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
	591	area = zone->free_area + current_order;
	592	if (list_empty(&area->free_list))
	593	continue;
	594
	595	page = list_entry(area->free_list.next, struct page, lru);
	596	list_del(&page->lru);
	597	rmv_page_order(page);
	598	area->nr_free--;
	599	zone->free_pages -= 1UL << order;
	600	expand(zone, page, order, current_order, area);
	601	return page;
	602	}
	603
	604	return NULL;
	605	}
	606
	607	/*
	608	* Obtain a specified number of elements from the buddy allocator, all under
	609	* a single hold of the lock, for efficiency. Add them to the supplied list.
	610	* Returns the number of new pages which were placed at *list.
	611	*/
	612	static int rmqueue_bulk(struct zone *zone, unsigned int order,
	613	unsigned long count, struct list_head *list)
	614	{
	615	int i;
	616
	617	spin_lock(&zone->lock);
	618	for (i = 0; i < count; ++i) {
	619	struct page *page = __rmqueue(zone, order);
	620	if (unlikely(page == NULL))
	621	break;
	622	list_add_tail(&page->lru, list);
	623	}
	624	spin_unlock(&zone->lock);
	625	return i;
	626	}
	627
	628	#ifdef CONFIG_NUMA
	629	/*
	630	* Called from the slab reaper to drain pagesets on a particular node that
	631	* belong to the currently executing processor.
	632	* Note that this function must be called with the thread pinned to
	633	* a single processor.
	634	*/
	635	void drain_node_pages(int nodeid)
	636	{
	637	int i, z;
	638	unsigned long flags;
	639
	640	for (z = 0; z < MAX_NR_ZONES; z++) {
	641	struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
	642	struct per_cpu_pageset *pset;
	643
	644	pset = zone_pcp(zone, smp_processor_id());
	645	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
	646	struct per_cpu_pages *pcp;
	647
	648	pcp = &pset->pcp[i];
	649	if (pcp->count) {
	650	local_irq_save(flags);
	651	free_pages_bulk(zone, pcp->count, &pcp->list, 0);
	652	pcp->count = 0;
	653	local_irq_restore(flags);
	654	}
	655	}
	656	}
	657	}
	658	#endif
	659
	#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
	/*
	 * Return every page held on @cpu's per-cpu lists, in every zone, to the
	 * buddy allocator.  Interrupts are disabled around each list while it
	 * is emptied.
	 */
	static void __drain_pages(unsigned int cpu)
	{
		unsigned long flags;
		struct zone *zone;
		int i;

		for_each_zone(zone) {
			struct per_cpu_pageset *pset;

			pset = zone_pcp(zone, cpu);
			for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
				struct per_cpu_pages *pcp;

				pcp = &pset->pcp[i];
				local_irq_save(flags);
				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
				pcp->count = 0;
				local_irq_restore(flags);
			}
		}
	}
	#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
	683
	684	#ifdef CONFIG_PM
	685
	686	void mark_free_pages(struct zone *zone)
	687	{
	688	unsigned long zone_pfn, flags;
	689	int order;
	690	struct list_head *curr;
	691
	692	if (!zone->spanned_pages)
	693	return;
	694
	695	spin_lock_irqsave(&zone->lock, flags);
	696	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
	697	ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
	698
	699	for (order = MAX_ORDER - 1; order >= 0; --order)
	700	list_for_each(curr, &zone->free_area[order].free_list) {
	701	unsigned long start_pfn, i;
	702
	703	start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
	704
	705	for (i=0; i < (1<<order); i++)
	706	SetPageNosaveFree(pfn_to_page(start_pfn+i));
	707	}
	708	spin_unlock_irqrestore(&zone->lock, flags);
	709	}
	710
	/*
	 * Flush the per-cpu page lists of the CPU we are running on back into
	 * the buddy allocator, with local interrupts disabled throughout.
	 */
	void drain_local_pages(void)
	{
		unsigned long irq_flags;

		local_irq_save(irq_flags);
		__drain_pages(smp_processor_id());
		local_irq_restore(irq_flags);
	}
	722	#endif /* CONFIG_PM */
	723
	724	/*
	725	* Free a 0-order page
	726	*/
	727	static void fastcall free_hot_cold_page(struct page *page, int cold)
	728	{
	729	struct zone *zone = page_zone(page);
	730	struct per_cpu_pages *pcp;
	731	unsigned long flags;
	732
	733	if (arch_free_page(page, 0))
	734	return;
	735
	736	if (PageAnon(page))
	737	page->mapping = NULL;
	738	if (free_pages_check(page))
	739	return;
	740
	741	kernel_map_pages(page, 1, 0);
	742
	743	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
	744	local_irq_save(flags);
	745	__count_vm_event(PGFREE);
	746	list_add(&page->lru, &pcp->list);
	747	pcp->count++;
	748	if (pcp->count >= pcp->high) {
	749	free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
	750	pcp->count -= pcp->batch;
	751	}
	752	local_irq_restore(flags);
	753	put_cpu();
	754	}
	755
	756	void fastcall free_hot_page(struct page *page)
	757	{
	758	free_hot_cold_page(page, 0);
	759	}
	760
	761	void fastcall free_cold_page(struct page *page)
	762	{
	763	free_hot_cold_page(page, 1);
	764	}
	765
	766	/*
	767	* split_page takes a non-compound higher-order page, and splits it into
	768	* n (1<<order) sub-pages: page[0..n]
	769	* Each sub-page must be freed individually.
	770	*
	771	* Note: this is probably too low level an operation for use in drivers.
	772	* Please consult with lkml before using this in your driver.
	773	*/
	774	void split_page(struct page *page, unsigned int order)
	775	{
	776	int i;
	777
	778	BUG_ON(PageCompound(page));
	779	BUG_ON(!page_count(page));
	780	for (i = 1; i < (1 << order); i++)
	781	set_page_refcounted(page + i);
	782	}
	783
	784	/*
	785	* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
	786	* we cheat by calling it from here, in the order > 0 path. Saves a branch
	787	* or two.
	788	*/
	789	static struct page buffered_rmqueue(struct zonelist zonelist,
	790	struct zone *zone, int order, gfp_t gfp_flags)
	791	{
	792	unsigned long flags;
	793	struct page *page;
	794	int cold = !!(gfp_flags & __GFP_COLD);
	795	int cpu;
	796
	797	again:
	798	cpu = get_cpu();
	799	if (likely(order == 0)) {
	800	struct per_cpu_pages *pcp;
	801
	802	pcp = &zone_pcp(zone, cpu)->pcp[cold];
	803	local_irq_save(flags);
	804	if (!pcp->count) {
	805	pcp->count += rmqueue_bulk(zone, 0,
	806	pcp->batch, &pcp->list);
	807	if (unlikely(!pcp->count))
	808	goto failed;
	809	}
	810	page = list_entry(pcp->list.next, struct page, lru);
	811	list_del(&page->lru);
	812	pcp->count--;
	813	} else {
	814	spin_lock_irqsave(&zone->lock, flags);
	815	page = __rmqueue(zone, order);
	816	spin_unlock(&zone->lock);
	817	if (!page)
	818	goto failed;
	819	}
	820
	821	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	822	zone_statistics(zonelist, zone);
	823	local_irq_restore(flags);
	824	put_cpu();
	825
	826	BUG_ON(bad_range(zone, page));
	827	if (prep_new_page(page, order, gfp_flags))
	828	goto again;
	829	return page;
	830
	831	failed:
	832	local_irq_restore(flags);
	833	put_cpu();
	834	return NULL;
	835	}
	836
	837	#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
	838	#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
	839	#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
	840	#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
	841	#define ALLOC_HARDER 0x10 /* try to alloc harder */
	842	#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
	843	#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
	844
	845	/*
	846	* Return 1 if free pages are above 'mark'. This takes into account the order
	847	* of the allocation.
	848	*/
	849	int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
	850	int classzone_idx, int alloc_flags)
	851	{
	852	/* free_pages my go negative - that's OK */
	853	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
	854	int o;
	855
	856	if (alloc_flags & ALLOC_HIGH)
	857	min -= min / 2;
	858	if (alloc_flags & ALLOC_HARDER)
	859	min -= min / 4;
	860
	861	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
	862	return 0;
	863	for (o = 0; o < order; o++) {
	864	/* At the next order, this order's pages become unavailable */
	865	free_pages -= z->free_area[o].nr_free << o;
	866
	867	/* Require fewer higher order pages to be free */
	868	min >>= 1;
	869
	870	if (free_pages <= min)
	871	return 0;
	872	}
	873	return 1;
	874	}
	875
	876	/*
	877	* get_page_from_freeliest goes through the zonelist trying to allocate
	878	* a page.
	879	*/
	880	static struct page *
	881	get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
	882	struct zonelist *zonelist, int alloc_flags)
	883	{
	884	struct zone **z = zonelist->zones;
	885	struct page *page = NULL;
	886	int classzone_idx = zone_idx(*z);
	887
	888	/*
	889	* Go through the zonelist once, looking for a zone with enough free.
	890	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	891	*/
	892	do {
	893	if ((alloc_flags & ALLOC_CPUSET) &&
	894	!cpuset_zone_allowed(*z, gfp_mask))
	895	continue;
	896
	897	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
	898	unsigned long mark;
	899	if (alloc_flags & ALLOC_WMARK_MIN)
	900	mark = (*z)->pages_min;
	901	else if (alloc_flags & ALLOC_WMARK_LOW)
	902	mark = (*z)->pages_low;
	903	else
	904	mark = (*z)->pages_high;
	905	if (!zone_watermark_ok(*z, order, mark,
	906	classzone_idx, alloc_flags))
	907	if (!zone_reclaim_mode \|\|
	908	!zone_reclaim(*z, gfp_mask, order))
	909	continue;
	910	}
	911
	912	page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
	913	if (page) {
	914	break;
	915	}
	916	} while (*(++z) != NULL);
	917	return page;
	918	}
	919
	920	/*
	921	* This is the 'heart' of the zoned buddy allocator.
	922	*/
	923	struct page * fastcall
	924	__alloc_pages(gfp_t gfp_mask, unsigned int order,
	925	struct zonelist *zonelist)
	926	{
	927	const gfp_t wait = gfp_mask & __GFP_WAIT;
	928	struct zone **z;
	929	struct page *page;
	930	struct reclaim_state reclaim_state;
	931	struct task_struct *p = current;
	932	int do_retry;
	933	int alloc_flags;
	934	int did_some_progress;
	935
	936	might_sleep_if(wait);
	937
	938	restart:
	939	z = zonelist->zones; /* the list of zones suitable for gfp_mask */
	940
	941	if (unlikely(*z == NULL)) {
	942	/* Should this ever happen?? */
	943	return NULL;
	944	}
	945
	946	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
	947	zonelist, ALLOC_WMARK_LOW\|ALLOC_CPUSET);
	948	if (page)
	949	goto got_pg;
	950
	951	do {
	952	wakeup_kswapd(*z, order);
	953	} while (*(++z));
	954
	955	/*
	956	* OK, we're below the kswapd watermark and have kicked background
	957	* reclaim. Now things get more complex, so set up alloc_flags according
	958	* to how we want to proceed.
	959	*
	960	* The caller may dip into page reserves a bit more if the caller
	961	* cannot run direct reclaim, or if the caller has realtime scheduling
	962	* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
	963	* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
	964	*/
	965	alloc_flags = ALLOC_WMARK_MIN;
	966	if ((unlikely(rt_task(p)) && !in_interrupt()) \|\| !wait)
	967	alloc_flags \|= ALLOC_HARDER;
	968	if (gfp_mask & __GFP_HIGH)
	969	alloc_flags \|= ALLOC_HIGH;
	970	if (wait)
	971	alloc_flags \|= ALLOC_CPUSET;
	972
	973	/*
	974	* Go through the zonelist again. Let __GFP_HIGH and allocations
	975	* coming from realtime tasks go deeper into reserves.
	976	*
	977	* This is the last chance, in general, before the goto nopage.
	978	* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	979	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	980	*/
	981	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
	982	if (page)
	983	goto got_pg;
	984
	985	/* This allocation should allow future memory freeing. */
	986
	987	if (((p->flags & PF_MEMALLOC) \|\| unlikely(test_thread_flag(TIF_MEMDIE)))
	988	&& !in_interrupt()) {
	989	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
	990	nofail_alloc:
	991	/* go through the zonelist yet again, ignoring mins */
	992	page = get_page_from_freelist(gfp_mask, order,
	993	zonelist, ALLOC_NO_WATERMARKS);
	994	if (page)
	995	goto got_pg;
	996	if (gfp_mask & __GFP_NOFAIL) {
	997	blk_congestion_wait(WRITE, HZ/50);
	998	goto nofail_alloc;
	999	}
	1000	}
	1001	goto nopage;
	1002	}
	1003
	1004	/* Atomic allocations - we can't balance anything */
	1005	if (!wait)
	1006	goto nopage;
	1007
	1008	rebalance:
	1009	cond_resched();
	1010
	1011	/* We now go into synchronous reclaim */
	1012	cpuset_memory_pressure_bump();
	1013	p->flags \|= PF_MEMALLOC;
	1014	reclaim_state.reclaimed_slab = 0;
	1015	p->reclaim_state = &reclaim_state;
	1016
	1017	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
	1018
	1019	p->reclaim_state = NULL;
	1020	p->flags &= ~PF_MEMALLOC;
	1021
	1022	cond_resched();
	1023
	1024	if (likely(did_some_progress)) {
	1025	page = get_page_from_freelist(gfp_mask, order,
	1026	zonelist, alloc_flags);
	1027	if (page)
	1028	goto got_pg;
	1029	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
	1030	/*
	1031	* Go through the zonelist yet one more time, keep
	1032	* very high watermark here, this is only to catch
	1033	* a parallel oom killing, we must fail if we're still
	1034	* under heavy pressure.
	1035	*/
	1036	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
	1037	zonelist, ALLOC_WMARK_HIGH\|ALLOC_CPUSET);
	1038	if (page)
	1039	goto got_pg;
	1040
	1041	out_of_memory(zonelist, gfp_mask, order);
	1042	goto restart;
	1043	}
	1044
	1045	/*
	1046	* Don't let big-order allocations loop unless the caller explicitly
	1047	* requests that. Wait for some write requests to complete then retry.
	1048	*
	1049	* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	1050	* <= 3, but that may not be true in other implementations.
	1051	*/
	1052	do_retry = 0;
	1053	if (!(gfp_mask & __GFP_NORETRY)) {
	1054	if ((order <= 3) \|\| (gfp_mask & __GFP_REPEAT))
	1055	do_retry = 1;
	1056	if (gfp_mask & __GFP_NOFAIL)
	1057	do_retry = 1;
	1058	}
	1059	if (do_retry) {
	1060	blk_congestion_wait(WRITE, HZ/50);
	1061	goto rebalance;
	1062	}
	1063
	1064	nopage:
	1065	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
	1066	printk(KERN_WARNING "%s: page allocation failure."
	1067	" order:%d, mode:0x%x\n",
	1068	p->comm, order, gfp_mask);
	1069	dump_stack();
	1070	show_mem();
	1071	}
	1072	got_pg:
	1073	return page;
	1074	}
	1075
	1076	EXPORT_SYMBOL(__alloc_pages);
	1077
	1078	/*
	1079	* Common helper functions.
	1080	*/
	1081	fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
	1082	{
	1083	struct page * page;
	1084	page = alloc_pages(gfp_mask, order);
	1085	if (!page)
	1086	return 0;
	1087	return (unsigned long) page_address(page);
	1088	}
	1089
	1090	EXPORT_SYMBOL(__get_free_pages);
	1091
	1092	fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
	1093	{
	1094	struct page * page;
	1095
	1096	/*
	1097	* get_zeroed_page() returns a 32-bit address, which cannot represent
	1098	* a highmem page
	1099	*/
	1100	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
	1101
	1102	page = alloc_pages(gfp_mask \| __GFP_ZERO, 0);
	1103	if (page)
	1104	return (unsigned long) page_address(page);
	1105	return 0;
	1106	}
	1107
	1108	EXPORT_SYMBOL(get_zeroed_page);
	1109
	1110	void __pagevec_free(struct pagevec *pvec)
	1111	{
	1112	int i = pagevec_count(pvec);
	1113
	1114	while (--i >= 0)
	1115	free_hot_cold_page(pvec->pages[i], pvec->cold);
	1116	}
	1117
	1118	fastcall void __free_pages(struct page *page, unsigned int order)
	1119	{
	1120	if (put_page_testzero(page)) {
	1121	if (order == 0)
	1122	free_hot_page(page);
	1123	else
	1124	__free_pages_ok(page, order);
	1125	}
	1126	}
	1127
	1128	EXPORT_SYMBOL(__free_pages);
	1129
	1130	fastcall void free_pages(unsigned long addr, unsigned int order)
	1131	{
	1132	if (addr != 0) {
	1133	BUG_ON(!virt_addr_valid((void *)addr));
	1134	__free_pages(virt_to_page((void *)addr), order);
	1135	}
	1136	}
	1137
	1138	EXPORT_SYMBOL(free_pages);
	1139
	1140	/*
	1141	* Total amount of free (allocatable) RAM:
	1142	*/
	1143	unsigned int nr_free_pages(void)
	1144	{
	1145	unsigned int sum = 0;
	1146	struct zone *zone;
	1147
	1148	for_each_zone(zone)
	1149	sum += zone->free_pages;
	1150
	1151	return sum;
	1152	}
	1153
	1154	EXPORT_SYMBOL(nr_free_pages);
	1155
	1156	#ifdef CONFIG_NUMA
	1157	unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
	1158	{
	1159	unsigned int i, sum = 0;
	1160
	1161	for (i = 0; i < MAX_NR_ZONES; i++)
	1162	sum += pgdat->node_zones[i].free_pages;
	1163
	1164	return sum;
	1165	}
	1166	#endif
	1167
	1168	static unsigned int nr_free_zone_pages(int offset)
	1169	{
	1170	/* Just pick one node, since fallback list is circular */
	1171	pg_data_t *pgdat = NODE_DATA(numa_node_id());
	1172	unsigned int sum = 0;
	1173
	1174	struct zonelist *zonelist = pgdat->node_zonelists + offset;
	1175	struct zone **zonep = zonelist->zones;
	1176	struct zone *zone;
	1177
	1178	for (zone = zonep++; zone; zone = zonep++) {
	1179	unsigned long size = zone->present_pages;
	1180	unsigned long high = zone->pages_high;
	1181	if (size > high)
	1182	sum += size - high;
	1183	}
	1184
	1185	return sum;
	1186	}
	1187
	1188	/*
	1189	* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
	1190	*/
	1191	unsigned int nr_free_buffer_pages(void)
	1192	{
	1193	return nr_free_zone_pages(gfp_zone(GFP_USER));
	1194	}
	1195
	1196	/*
	1197	* Amount of free RAM allocatable within all zones
	1198	*/
	1199	unsigned int nr_free_pagecache_pages(void)
	1200	{
	1201	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
	1202	}
	1203
	1204	#ifdef CONFIG_HIGHMEM
	1205	unsigned int nr_free_highpages (void)
	1206	{
	1207	pg_data_t *pgdat;
	1208	unsigned int pages = 0;
	1209
	1210	for_each_online_pgdat(pgdat)
	1211	pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
	1212
	1213	return pages;
	1214	}
	1215	#endif
	1216
	1217	#ifdef CONFIG_NUMA
	1218	static void show_node(struct zone *zone)
	1219	{
	1220	printk("Node %d ", zone->zone_pgdat->node_id);
	1221	}
	1222	#else
	1223	#define show_node(zone) do { } while (0)
	1224	#endif
	1225
	1226	void si_meminfo(struct sysinfo *val)
	1227	{
	1228	val->totalram = totalram_pages;
	1229	val->sharedram = 0;
	1230	val->freeram = nr_free_pages();
	1231	val->bufferram = nr_blockdev_pages();
	1232	#ifdef CONFIG_HIGHMEM
	1233	val->totalhigh = totalhigh_pages;
	1234	val->freehigh = nr_free_highpages();
	1235	#else
	1236	val->totalhigh = 0;
	1237	val->freehigh = 0;
	1238	#endif
	1239	val->mem_unit = PAGE_SIZE;
	1240	}
	1241
	1242	EXPORT_SYMBOL(si_meminfo);
	1243
	1244	#ifdef CONFIG_NUMA
	1245	void si_meminfo_node(struct sysinfo *val, int nid)
	1246	{
	1247	pg_data_t *pgdat = NODE_DATA(nid);
	1248
	1249	val->totalram = pgdat->node_present_pages;
	1250	val->freeram = nr_free_pages_pgdat(pgdat);
	1251	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	1252	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
	1253	val->mem_unit = PAGE_SIZE;
	1254	}
	1255	#endif
	1256
	1257	#define K(x) ((x) << (PAGE_SHIFT-10))
	1258
	1259	/*
	1260	* Show free area list (used inside shift_scroll-lock stuff)
	1261	* We also calculate the percentage fragmentation. We do this by counting the
	1262	* memory on each free list with the exception of the first item on the list.
	1263	*/
	1264	void show_free_areas(void)
	1265	{
	1266	int cpu, temperature;
	1267	unsigned long active;
	1268	unsigned long inactive;
	1269	unsigned long free;
	1270	struct zone *zone;
	1271
	1272	for_each_zone(zone) {
	1273	show_node(zone);
	1274	printk("%s per-cpu:", zone->name);
	1275
	1276	if (!populated_zone(zone)) {
	1277	printk(" empty\n");
	1278	continue;
	1279	} else
	1280	printk("\n");
	1281
	1282	for_each_online_cpu(cpu) {
	1283	struct per_cpu_pageset *pageset;
	1284
	1285	pageset = zone_pcp(zone, cpu);
	1286
	1287	for (temperature = 0; temperature < 2; temperature++)
	1288	printk("cpu %d %s: high %d, batch %d used:%d\n",
	1289	cpu,
	1290	temperature ? "cold" : "hot",
	1291	pageset->pcp[temperature].high,
	1292	pageset->pcp[temperature].batch,
	1293	pageset->pcp[temperature].count);
	1294	}
	1295	}
	1296
	1297	get_zone_counts(&active, &inactive, &free);
	1298
	1299	printk("Free pages: %11ukB (%ukB HighMem)\n",
	1300	K(nr_free_pages()),
	1301	K(nr_free_highpages()));
	1302
	1303	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
	1304	"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
	1305	active,
	1306	inactive,
	1307	global_page_state(NR_FILE_DIRTY),
	1308	global_page_state(NR_WRITEBACK),
	1309	global_page_state(NR_UNSTABLE_NFS),
	1310	nr_free_pages(),
	1311	global_page_state(NR_SLAB),
	1312	global_page_state(NR_FILE_MAPPED),
	1313	global_page_state(NR_PAGETABLE));
	1314
	1315	for_each_zone(zone) {
	1316	int i;
	1317
	1318	show_node(zone);
	1319	printk("%s"
	1320	" free:%lukB"
	1321	" min:%lukB"
	1322	" low:%lukB"
	1323	" high:%lukB"
	1324	" active:%lukB"
	1325	" inactive:%lukB"
	1326	" present:%lukB"
	1327	" pages_scanned:%lu"
	1328	" all_unreclaimable? %s"
	1329	"\n",
	1330	zone->name,
	1331	K(zone->free_pages),
	1332	K(zone->pages_min),
	1333	K(zone->pages_low),
	1334	K(zone->pages_high),
	1335	K(zone->nr_active),
	1336	K(zone->nr_inactive),
	1337	K(zone->present_pages),
	1338	zone->pages_scanned,
	1339	(zone->all_unreclaimable ? "yes" : "no")
	1340	);
	1341	printk("lowmem_reserve[]:");
	1342	for (i = 0; i < MAX_NR_ZONES; i++)
	1343	printk(" %lu", zone->lowmem_reserve[i]);
	1344	printk("\n");
	1345	}
	1346
	1347	for_each_zone(zone) {
	1348	unsigned long nr[MAX_ORDER], flags, order, total = 0;
	1349
	1350	show_node(zone);
	1351	printk("%s: ", zone->name);
	1352	if (!populated_zone(zone)) {
	1353	printk("empty\n");
	1354	continue;
	1355	}
	1356
	1357	spin_lock_irqsave(&zone->lock, flags);
	1358	for (order = 0; order < MAX_ORDER; order++) {
	1359	nr[order] = zone->free_area[order].nr_free;
	1360	total += nr[order] << order;
	1361	}
	1362	spin_unlock_irqrestore(&zone->lock, flags);
	1363	for (order = 0; order < MAX_ORDER; order++)
	1364	printk("%lu*%lukB ", nr[order], K(1UL) << order);
	1365	printk("= %lukB\n", K(total));
	1366	}
	1367
	1368	show_swap_cache_info();
	1369	}
	1370
	1371	/*
	1372	* Builds allocation fallback zone lists.
	1373	*
	1374	* Add all populated zones of a node to the zonelist.
	1375	*/
	1376	static int __meminit build_zonelists_node(pg_data_t *pgdat,
	1377	struct zonelist *zonelist, int nr_zones, int zone_type)
	1378	{
	1379	struct zone *zone;
	1380
	1381	BUG_ON(zone_type > ZONE_HIGHMEM);
	1382
	1383	do {
	1384	zone = pgdat->node_zones + zone_type;
	1385	if (populated_zone(zone)) {
	1386	#ifndef CONFIG_HIGHMEM
	1387	BUG_ON(zone_type > ZONE_NORMAL);
	1388	#endif
	1389	zonelist->zones[nr_zones++] = zone;
	1390	check_highest_zone(zone_type);
	1391	}
	1392	zone_type--;
	1393
	1394	} while (zone_type >= 0);
	1395	return nr_zones;
	1396	}
	1397
	1398	static inline int highest_zone(int zone_bits)
	1399	{
	1400	int res = ZONE_NORMAL;
	1401	if (zone_bits & (__force int)__GFP_HIGHMEM)
	1402	res = ZONE_HIGHMEM;
	1403	if (zone_bits & (__force int)__GFP_DMA32)
	1404	res = ZONE_DMA32;
	1405	if (zone_bits & (__force int)__GFP_DMA)
	1406	res = ZONE_DMA;
	1407	return res;
	1408	}
	1409
	1410	#ifdef CONFIG_NUMA
	1411	#define MAX_NODE_LOAD (num_online_nodes())
	1412	static int __meminitdata node_load[MAX_NUMNODES];
	1413	/**
	1414	* find_next_best_node - find the next node that should appear in a given node's fallback list
	1415	* @node: node whose fallback list we're appending
	1416	* @used_node_mask: nodemask_t of already used nodes
	1417	*
	1418	* We use a number of factors to determine which is the next node that should
	1419	* appear on a given node's fallback list. The node should not have appeared
	1420	* already in @node's fallback list, and it should be the next closest node
	1421	* according to the distance array (which contains arbitrary distance values
	1422	* from each node to each node in the system), and should also prefer nodes
	1423	* with no CPUs, since presumably they'll have very little allocation pressure
	1424	* on them otherwise.
	1425	* It returns -1 if no node is found.
	1426	*/
	1427	static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
	1428	{
	1429	int n, val;
	1430	int min_val = INT_MAX;
	1431	int best_node = -1;
	1432
	1433	/* Use the local node if we haven't already */
	1434	if (!node_isset(node, *used_node_mask)) {
	1435	node_set(node, *used_node_mask);
	1436	return node;
	1437	}
	1438
	1439	for_each_online_node(n) {
	1440	cpumask_t tmp;
	1441
	1442	/* Don't want a node to appear more than once */
	1443	if (node_isset(n, *used_node_mask))
	1444	continue;
	1445
	1446	/* Use the distance array to find the distance */
	1447	val = node_distance(node, n);
	1448
	1449	/* Penalize nodes under us ("prefer the next node") */
	1450	val += (n < node);
	1451
	1452	/* Give preference to headless and unused nodes */
	1453	tmp = node_to_cpumask(n);
	1454	if (!cpus_empty(tmp))
	1455	val += PENALTY_FOR_NODE_WITH_CPUS;
	1456
	1457	/* Slight preference for less loaded node */
	1458	val = (MAX_NODE_LOADMAX_NUMNODES);
	1459	val += node_load[n];
	1460
	1461	if (val < min_val) {
	1462	min_val = val;
	1463	best_node = n;
	1464	}
	1465	}
	1466
	1467	if (best_node >= 0)
	1468	node_set(best_node, *used_node_mask);
	1469
	1470	return best_node;
	1471	}
	1472
	1473	static void __meminit build_zonelists(pg_data_t *pgdat)
	1474	{
	1475	int i, j, k, node, local_node;
	1476	int prev_node, load;
	1477	struct zonelist *zonelist;
	1478	nodemask_t used_mask;
	1479
	1480	/* initialize zonelists */
	1481	for (i = 0; i < GFP_ZONETYPES; i++) {
	1482	zonelist = pgdat->node_zonelists + i;
	1483	zonelist->zones[0] = NULL;
	1484	}
	1485
	1486	/* NUMA-aware ordering of nodes */
	1487	local_node = pgdat->node_id;
	1488	load = num_online_nodes();
	1489	prev_node = local_node;
	1490	nodes_clear(used_mask);
	1491	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
	1492	int distance = node_distance(local_node, node);
	1493
	1494	/*
	1495	* If another node is sufficiently far away then it is better
	1496	* to reclaim pages in a zone before going off node.
	1497	*/
	1498	if (distance > RECLAIM_DISTANCE)
	1499	zone_reclaim_mode = 1;
	1500
	1501	/*
	1502	* We don't want to pressure a particular node.
	1503	* So adding penalty to the first node in same
	1504	* distance group to make it round-robin.
	1505	*/
	1506
	1507	if (distance != node_distance(local_node, prev_node))
	1508	node_load[node] += load;
	1509	prev_node = node;
	1510	load--;
	1511	for (i = 0; i < GFP_ZONETYPES; i++) {
	1512	zonelist = pgdat->node_zonelists + i;
	1513	for (j = 0; zonelist->zones[j] != NULL; j++);
	1514
	1515	k = highest_zone(i);
	1516
	1517	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
	1518	zonelist->zones[j] = NULL;
	1519	}
	1520	}
	1521	}
	1522
	1523	#else /* CONFIG_NUMA */
	1524
	1525	static void __meminit build_zonelists(pg_data_t *pgdat)
	1526	{
	1527	int i, j, k, node, local_node;
	1528
	1529	local_node = pgdat->node_id;
	1530	for (i = 0; i < GFP_ZONETYPES; i++) {
	1531	struct zonelist *zonelist;
	1532
	1533	zonelist = pgdat->node_zonelists + i;
	1534
	1535	j = 0;
	1536	k = highest_zone(i);
	1537	j = build_zonelists_node(pgdat, zonelist, j, k);
	1538	/*
	1539	* Now we build the zonelist so that it contains the zones
	1540	* of all the other nodes.
	1541	* We don't want to pressure a particular node, so when
	1542	* building the zones for node N, we make sure that the
	1543	* zones coming right after the local ones are those from
	1544	* node N+1 (modulo N)
	1545	*/
	1546	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
	1547	if (!node_online(node))
	1548	continue;
	1549	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
	1550	}
	1551	for (node = 0; node < local_node; node++) {
	1552	if (!node_online(node))
	1553	continue;
	1554	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
	1555	}
	1556
	1557	zonelist->zones[j] = NULL;
	1558	}
	1559	}
	1560
	1561	#endif /* CONFIG_NUMA */
	1562
	1563	/* return values int ....just for stop_machine_run() */
	1564	static int __meminit __build_all_zonelists(void *dummy)
	1565	{
	1566	int nid;
	1567	for_each_online_node(nid)
	1568	build_zonelists(NODE_DATA(nid));
	1569	return 0;
	1570	}
	1571
	1572	void __meminit build_all_zonelists(void)
	1573	{
	1574	if (system_state == SYSTEM_BOOTING) {
	1575	__build_all_zonelists(0);
	1576	cpuset_init_current_mems_allowed();
	1577	} else {
	1578	/* we have to stop all cpus to guaranntee there is no user
	1579	of zonelist */
	1580	stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
	1581	/* cpuset refresh routine should be here */
	1582	}
	1583	vm_total_pages = nr_free_pagecache_pages();
	1584	printk("Built %i zonelists. Total pages: %ld\n",
	1585	num_online_nodes(), vm_total_pages);
	1586	}
	1587
	1588	/*
	1589	* Helper functions to size the waitqueue hash table.
	1590	* Essentially these want to choose hash table sizes sufficiently
	1591	* large so that collisions trying to wait on pages are rare.
	1592	* But in fact, the number of active page waitqueues on typical
	1593	* systems is ridiculously low, less than 200. So this is even
	1594	* conservative, even though it seems large.
	1595	*
	1596	* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
	1597	* waitqueues, i.e. the size of the waitq table given the number of pages.
	1598	*/
	1599	#define PAGES_PER_WAITQUEUE 256
	1600
	1601	#ifndef CONFIG_MEMORY_HOTPLUG
	1602	static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
	1603	{
	1604	unsigned long size = 1;
	1605
	1606	pages /= PAGES_PER_WAITQUEUE;
	1607
	1608	while (size < pages)
	1609	size <<= 1;
	1610
	1611	/*
	1612	* Once we have dozens or even hundreds of threads sleeping
	1613	* on IO we've got bigger problems than wait queue collision.
	1614	* Limit the size of the wait table to a reasonable size.
	1615	*/
	1616	size = min(size, 4096UL);
	1617
	1618	return max(size, 4UL);
	1619	}
	1620	#else
	1621	/*
	1622	* A zone's size might be changed by hot-add, so it is not possible to determine
	1623	* a suitable size for its wait_table. So we use the maximum size now.
	1624	*
	1625	* The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
	1626	*
	1627	* i386 (preemption config) : 4096 x 16 = 64Kbyte.
	1628	* ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
	1629	* ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
	1630	*
	1631	* The maximum entries are prepared when a zone's memory is (512K + 256) pages
	1632	* or more by the traditional way. (See above). It equals:
	1633	*
	1634	* i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
	1635	* ia64(16K page size) : = ( 8G + 4M)byte.
	1636	* powerpc (64K page size) : = (32G +16M)byte.
	1637	*/
	1638	static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
	1639	{
	1640	return 4096UL;
	1641	}
	1642	#endif
	1643
	1644	/*
	1645	* This is an integer logarithm so that shifts can be used later
	1646	* to extract the more random high bits from the multiplicative
	1647	* hash function before the remainder is taken.
	1648	*/
	1649	static inline unsigned long wait_table_bits(unsigned long size)
	1650	{
	1651	return ffz(~size);
	1652	}
	1653
	1654	#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
	1655
	1656	static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
	1657	unsigned long zones_size, unsigned long zholes_size)
	1658	{
	1659	unsigned long realtotalpages, totalpages = 0;
	1660	int i;
	1661
	1662	for (i = 0; i < MAX_NR_ZONES; i++)
	1663	totalpages += zones_size[i];
	1664	pgdat->node_spanned_pages = totalpages;
	1665
	1666	realtotalpages = totalpages;
	1667	if (zholes_size)
	1668	for (i = 0; i < MAX_NR_ZONES; i++)
	1669	realtotalpages -= zholes_size[i];
	1670	pgdat->node_present_pages = realtotalpages;
	1671	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
	1672	}
	1673
	1674
	1675	/*
	1676	* Initially all pages are reserved - free ones are freed
	1677	* up by free_all_bootmem() once the early boot process is
	1678	* done. Non-atomic initialization, single-pass.
	1679	*/
	1680	void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
	1681	unsigned long start_pfn)
	1682	{
	1683	struct page *page;
	1684	unsigned long end_pfn = start_pfn + size;
	1685	unsigned long pfn;
	1686
	1687	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
	1688	if (!early_pfn_valid(pfn))
	1689	continue;
	1690	page = pfn_to_page(pfn);
	1691	set_page_links(page, zone, nid, pfn);
	1692	init_page_count(page);
	1693	reset_page_mapcount(page);
	1694	SetPageReserved(page);
	1695	INIT_LIST_HEAD(&page->lru);
	1696	#ifdef WANT_PAGE_VIRTUAL
	1697	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	1698	if (!is_highmem_idx(zone))
	1699	set_page_address(page, __va(pfn << PAGE_SHIFT));
	1700	#endif
	1701	}
	1702	}
	1703
	1704	void zone_init_free_lists(struct pglist_data pgdat, struct zone zone,
	1705	unsigned long size)
	1706	{
	1707	int order;
	1708	for (order = 0; order < MAX_ORDER ; order++) {
	1709	INIT_LIST_HEAD(&zone->free_area[order].free_list);
	1710	zone->free_area[order].nr_free = 0;
	1711	}
	1712	}
	1713
	1714	#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) \| zone_nr)
	1715	void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
	1716	unsigned long size)
	1717	{
	1718	unsigned long snum = pfn_to_section_nr(pfn);
	1719	unsigned long end = pfn_to_section_nr(pfn + size);
	1720
	1721	if (FLAGS_HAS_NODE)
	1722	zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
	1723	else
	1724	for (; snum <= end; snum++)
	1725	zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
	1726	}
	1727
	1728	#ifndef __HAVE_ARCH_MEMMAP_INIT
	1729	#define memmap_init(size, nid, zone, start_pfn) \
	1730	memmap_init_zone((size), (nid), (zone), (start_pfn))
	1731	#endif
	1732
	1733	static int __cpuinit zone_batchsize(struct zone *zone)
	1734	{
	1735	int batch;
	1736
	1737	/*
	1738	* The per-cpu-pages pools are set to around 1000th of the
	1739	* size of the zone. But no more than 1/2 of a meg.
	1740	*
	1741	* OK, so we don't know how big the cache is. So guess.
	1742	*/
	1743	batch = zone->present_pages / 1024;
	1744	if (batch * PAGE_SIZE > 512 * 1024)
	1745	batch = (512 * 1024) / PAGE_SIZE;
	1746	batch /= 4; /* We effectively = 4 below /
	1747	if (batch < 1)
	1748	batch = 1;
	1749
	1750	/*
	1751	* Clamp the batch to a 2^n - 1 value. Having a power
	1752	* of 2 value was found to be more likely to have
	1753	* suboptimal cache aliasing properties in some cases.
	1754	*
	1755	* For example if 2 tasks are alternately allocating
	1756	* batches of pages, one task can end up with a lot
	1757	* of pages of one half of the possible page colors
	1758	* and the other with pages of the other colors.
	1759	*/
	1760	batch = (1 << (fls(batch + batch/2)-1)) - 1;
	1761
	1762	return batch;
	1763	}
	1764
	1765	inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
	1766	{
	1767	struct per_cpu_pages *pcp;
	1768
	1769	memset(p, 0, sizeof(*p));
	1770
	1771	pcp = &p->pcp[0]; /* hot */
	1772	pcp->count = 0;
	1773	pcp->high = 6 * batch;
	1774	pcp->batch = max(1UL, 1 * batch);
	1775	INIT_LIST_HEAD(&pcp->list);
	1776
	1777	pcp = &p->pcp[1]; /* cold*/
	1778	pcp->count = 0;
	1779	pcp->high = 2 * batch;
	1780	pcp->batch = max(1UL, batch/2);
	1781	INIT_LIST_HEAD(&pcp->list);
	1782	}
	1783
	1784	/*
	1785	* setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
	1786	* to the value high for the pageset p.
	1787	*/
	1788
	1789	static void setup_pagelist_highmark(struct per_cpu_pageset *p,
	1790	unsigned long high)
	1791	{
	1792	struct per_cpu_pages *pcp;
	1793
	1794	pcp = &p->pcp[0]; /* hot list */
	1795	pcp->high = high;
	1796	pcp->batch = max(1UL, high/4);
	1797	if ((high/4) > (PAGE_SHIFT * 8))
	1798	pcp->batch = PAGE_SHIFT * 8;
	1799	}
	1800
	1801
	1802	#ifdef CONFIG_NUMA
	1803	/*
	1804	* Boot pageset table. One per cpu which is going to be used for all
	1805	* zones and all nodes. The parameters will be set in such a way
	1806	* that an item put on a list will immediately be handed over to
	1807	* the buddy list. This is safe since pageset manipulation is done
	1808	* with interrupts disabled.
	1809	*
	1810	* Some NUMA counter updates may also be caught by the boot pagesets.
	1811	*
	1812	* The boot_pagesets must be kept even after bootup is complete for
	1813	* unused processors and/or zones. They do play a role for bootstrapping
	1814	* hotplugged processors.
	1815	*
	1816	* zoneinfo_show() and maybe other functions do
	1817	* not check if the processor is online before following the pageset pointer.
	1818	* Other parts of the kernel may not check if the zone is available.
	1819	*/
	1820	static struct per_cpu_pageset boot_pageset[NR_CPUS];
	1821
	1822	/*
	1823	* Dynamically allocate memory for the
	1824	* per cpu pageset array in struct zone.
	1825	*/
	1826	static int __cpuinit process_zones(int cpu)
	1827	{
	1828	struct zone zone, dzone;
	1829
	1830	for_each_zone(zone) {
	1831
	1832	zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
	1833	GFP_KERNEL, cpu_to_node(cpu));
	1834	if (!zone_pcp(zone, cpu))
	1835	goto bad;
	1836
	1837	setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
	1838
	1839	if (percpu_pagelist_fraction)
	1840	setup_pagelist_highmark(zone_pcp(zone, cpu),
	1841	(zone->present_pages / percpu_pagelist_fraction));
	1842	}
	1843
	1844	return 0;
	1845	bad:
	1846	for_each_zone(dzone) {
	1847	if (dzone == zone)
	1848	break;
	1849	kfree(zone_pcp(dzone, cpu));
	1850	zone_pcp(dzone, cpu) = NULL;
	1851	}
	1852	return -ENOMEM;
	1853	}
	1854
	1855	static inline void free_zone_pagesets(int cpu)
	1856	{
	1857	struct zone *zone;
	1858
	1859	for_each_zone(zone) {
	1860	struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
	1861
	1862	zone_pcp(zone, cpu) = NULL;
	1863	kfree(pset);
	1864	}
	1865	}
	1866
	1867	static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
	1868	unsigned long action,
	1869	void *hcpu)
	1870	{
	1871	int cpu = (long)hcpu;
	1872	int ret = NOTIFY_OK;
	1873
	1874	switch (action) {
	1875	case CPU_UP_PREPARE:
	1876	if (process_zones(cpu))
	1877	ret = NOTIFY_BAD;
	1878	break;
	1879	case CPU_UP_CANCELED:
	1880	case CPU_DEAD:
	1881	free_zone_pagesets(cpu);
	1882	break;
	1883	default:
	1884	break;
	1885	}
	1886	return ret;
	1887	}
	1888
	1889	static struct notifier_block __cpuinitdata pageset_notifier =
	1890	{ &pageset_cpuup_callback, NULL, 0 };
	1891
	1892	void __init setup_per_cpu_pageset(void)
	1893	{
	1894	int err;
	1895
	1896	/* Initialize per_cpu_pageset for cpu 0.
	1897	* A cpuup callback will do this for every cpu
	1898	* as it comes online
	1899	*/
	1900	err = process_zones(smp_processor_id());
	1901	BUG_ON(err);
	1902	register_cpu_notifier(&pageset_notifier);
	1903	}
	1904
	1905	#endif
	1906
	1907	static __meminit
	1908	int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
	1909	{
	1910	int i;
	1911	struct pglist_data *pgdat = zone->zone_pgdat;
	1912	size_t alloc_size;
	1913
	1914	/*
	1915	* The per-page waitqueue mechanism uses hashed waitqueues
	1916	* per zone.
	1917	*/
	1918	zone->wait_table_hash_nr_entries =
	1919	wait_table_hash_nr_entries(zone_size_pages);
	1920	zone->wait_table_bits =
	1921	wait_table_bits(zone->wait_table_hash_nr_entries);
	1922	alloc_size = zone->wait_table_hash_nr_entries
	1923	* sizeof(wait_queue_head_t);
	1924
	1925	if (system_state == SYSTEM_BOOTING) {
	1926	zone->wait_table = (wait_queue_head_t *)
	1927	alloc_bootmem_node(pgdat, alloc_size);
	1928	} else {
	1929	/*
	1930	* This case means that a zone whose size was 0 gets new memory
	1931	* via memory hot-add.
	1932	* But it may be the case that a new node was hot-added. In
	1933	* this case vmalloc() will not be able to use this new node's
	1934	* memory - this wait_table must be initialized to use this new
	1935	* node itself as well.
	1936	* To use this new node's memory, further consideration will be
	1937	* necessary.
	1938	*/
	1939	zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
	1940	}
	1941	if (!zone->wait_table)
	1942	return -ENOMEM;
	1943
	1944	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
	1945	init_waitqueue_head(zone->wait_table + i);
	1946
	1947	return 0;
	1948	}
	1949
	1950	static __meminit void zone_pcp_init(struct zone *zone)
	1951	{
	1952	int cpu;
	1953	unsigned long batch = zone_batchsize(zone);
	1954
	1955	for (cpu = 0; cpu < NR_CPUS; cpu++) {
	1956	#ifdef CONFIG_NUMA
	1957	/* Early boot. Slab allocator not functional yet */
	1958	zone_pcp(zone, cpu) = &boot_pageset[cpu];
	1959	setup_pageset(&boot_pageset[cpu],0);
	1960	#else
	1961	setup_pageset(zone_pcp(zone,cpu), batch);
	1962	#endif
	1963	}
	1964	if (zone->present_pages)
	1965	printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
	1966	zone->name, zone->present_pages, batch);
	1967	}
	1968
	1969	__meminit int init_currently_empty_zone(struct zone *zone,
	1970	unsigned long zone_start_pfn,
	1971	unsigned long size)
	1972	{
	1973	struct pglist_data *pgdat = zone->zone_pgdat;
	1974	int ret;
	1975	ret = zone_wait_table_init(zone, size);
	1976	if (ret)
	1977	return ret;
	1978	pgdat->nr_zones = zone_idx(zone) + 1;
	1979
	1980	zone->zone_start_pfn = zone_start_pfn;
	1981
	1982	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
	1983
	1984	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
	1985
	1986	return 0;
	1987	}
	1988
	1989	/*
	1990	* Set up the zone data structures:
	1991	* - mark all pages reserved
	1992	* - mark all memory queues empty
	1993	* - clear the memory bitmaps
	1994	*/
	1995	static void __meminit free_area_init_core(struct pglist_data *pgdat,
	1996	unsigned long zones_size, unsigned long zholes_size)
	1997	{
	1998	unsigned long j;
	1999	int nid = pgdat->node_id;
	2000	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	2001	int ret;
	2002
	2003	pgdat_resize_init(pgdat);
	2004	pgdat->nr_zones = 0;
	2005	init_waitqueue_head(&pgdat->kswapd_wait);
	2006	pgdat->kswapd_max_order = 0;
	2007
	2008	for (j = 0; j < MAX_NR_ZONES; j++) {
	2009	struct zone *zone = pgdat->node_zones + j;
	2010	unsigned long size, realsize;
	2011
	2012	realsize = size = zones_size[j];
	2013	if (zholes_size)
	2014	realsize -= zholes_size[j];
	2015
	2016	if (j < ZONE_HIGHMEM)
	2017	nr_kernel_pages += realsize;
	2018	nr_all_pages += realsize;
	2019
	2020	zone->spanned_pages = size;
	2021	zone->present_pages = realsize;
	2022	#ifdef CONFIG_NUMA
	2023	zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
	2024	/ 100;
	2025	#endif
	2026	zone->name = zone_names[j];
	2027	spin_lock_init(&zone->lock);
	2028	spin_lock_init(&zone->lru_lock);
	2029	zone_seqlock_init(zone);
	2030	zone->zone_pgdat = pgdat;
	2031	zone->free_pages = 0;
	2032
	2033	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
	2034
	2035	zone_pcp_init(zone);
	2036	INIT_LIST_HEAD(&zone->active_list);
	2037	INIT_LIST_HEAD(&zone->inactive_list);
	2038	zone->nr_scan_active = 0;
	2039	zone->nr_scan_inactive = 0;
	2040	zone->nr_active = 0;
	2041	zone->nr_inactive = 0;
	2042	zap_zone_vm_stats(zone);
	2043	atomic_set(&zone->reclaim_in_progress, 0);
	2044	if (!size)
	2045	continue;
	2046
	2047	zonetable_add(zone, nid, j, zone_start_pfn, size);
	2048	ret = init_currently_empty_zone(zone, zone_start_pfn, size);
	2049	BUG_ON(ret);
	2050	zone_start_pfn += size;
	2051	}
	2052	}
	2053
	2054	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
	2055	{
	2056	/* Skip empty nodes */
	2057	if (!pgdat->node_spanned_pages)
	2058	return;
	2059
	2060	#ifdef CONFIG_FLAT_NODE_MEM_MAP
	2061	/* ia64 gets its own node_mem_map, before this, without bootmem */
	2062	if (!pgdat->node_mem_map) {
	2063	unsigned long size, start, end;
	2064	struct page *map;
	2065
	2066	/*
	2067	* The zone's endpoints aren't required to be MAX_ORDER
	2068	* aligned but the node_mem_map endpoints must be in order
	2069	* for the buddy allocator to function correctly.
	2070	*/
	2071	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	2072	end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
	2073	end = ALIGN(end, MAX_ORDER_NR_PAGES);
	2074	size = (end - start) * sizeof(struct page);
	2075	map = alloc_remap(pgdat->node_id, size);
	2076	if (!map)
	2077	map = alloc_bootmem_node(pgdat, size);
	2078	pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
	2079	}
	2080	#ifdef CONFIG_FLATMEM
	2081	/*
	2082	* With no DISCONTIG, the global mem_map is just set as node 0's
	2083	*/
	2084	if (pgdat == NODE_DATA(0))
	2085	mem_map = NODE_DATA(0)->node_mem_map;
	2086	#endif
	2087	#endif /* CONFIG_FLAT_NODE_MEM_MAP */
	2088	}
	2089
	2090	void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
	2091	unsigned long *zones_size, unsigned long node_start_pfn,
	2092	unsigned long *zholes_size)
	2093	{
	2094	pgdat->node_id = nid;
	2095	pgdat->node_start_pfn = node_start_pfn;
	2096	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
	2097
	2098	alloc_node_mem_map(pgdat);
	2099
	2100	free_area_init_core(pgdat, zones_size, zholes_size);
	2101	}
	2102
	2103	#ifndef CONFIG_NEED_MULTIPLE_NODES
	2104	static bootmem_data_t contig_bootmem_data;
	2105	struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
	2106
	2107	EXPORT_SYMBOL(contig_page_data);
	2108	#endif
	2109
	2110	void __init free_area_init(unsigned long *zones_size)
	2111	{
	2112	free_area_init_node(0, NODE_DATA(0), zones_size,
	2113	__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
	2114	}
	2115
	2116	#ifdef CONFIG_HOTPLUG_CPU
	2117	static int page_alloc_cpu_notify(struct notifier_block *self,
	2118	unsigned long action, void *hcpu)
	2119	{
	2120	int cpu = (unsigned long)hcpu;
	2121
	2122	if (action == CPU_DEAD) {
	2123	local_irq_disable();
	2124	__drain_pages(cpu);
	2125	vm_events_fold_cpu(cpu);
	2126	local_irq_enable();
	2127	refresh_cpu_vm_stats(cpu);
	2128	}
	2129	return NOTIFY_OK;
	2130	}
	2131	#endif /* CONFIG_HOTPLUG_CPU */
	2132
	2133	void __init page_alloc_init(void)
	2134	{
	2135	hotcpu_notifier(page_alloc_cpu_notify, 0);
	2136	}
	2137
	2138	/*
	2139	* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
	2140	* or min_free_kbytes changes.
	2141	*/
	2142	static void calculate_totalreserve_pages(void)
	2143	{
	2144	struct pglist_data *pgdat;
	2145	unsigned long reserve_pages = 0;
	2146	int i, j;
	2147
	2148	for_each_online_pgdat(pgdat) {
	2149	for (i = 0; i < MAX_NR_ZONES; i++) {
	2150	struct zone *zone = pgdat->node_zones + i;
	2151	unsigned long max = 0;
	2152
	2153	/* Find valid and maximum lowmem_reserve in the zone */
	2154	for (j = i; j < MAX_NR_ZONES; j++) {
	2155	if (zone->lowmem_reserve[j] > max)
	2156	max = zone->lowmem_reserve[j];
	2157	}
	2158
	2159	/* we treat pages_high as reserved pages. */
	2160	max += zone->pages_high;
	2161
	2162	if (max > zone->present_pages)
	2163	max = zone->present_pages;
	2164	reserve_pages += max;
	2165	}
	2166	}
	2167	totalreserve_pages = reserve_pages;
	2168	}
	2169
	2170	/*
	2171	* setup_per_zone_lowmem_reserve - called whenever
	2172	* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
	2173	* has a correct pages reserved value, so an adequate number of
	2174	* pages are left in the zone after a successful __alloc_pages().
	2175	*/
	2176	static void setup_per_zone_lowmem_reserve(void)
	2177	{
	2178	struct pglist_data *pgdat;
	2179	int j, idx;
	2180
	2181	for_each_online_pgdat(pgdat) {
	2182	for (j = 0; j < MAX_NR_ZONES; j++) {
	2183	struct zone *zone = pgdat->node_zones + j;
	2184	unsigned long present_pages = zone->present_pages;
	2185
	2186	zone->lowmem_reserve[j] = 0;
	2187
	2188	for (idx = j-1; idx >= 0; idx--) {
	2189	struct zone *lower_zone;
	2190
	2191	if (sysctl_lowmem_reserve_ratio[idx] < 1)
	2192	sysctl_lowmem_reserve_ratio[idx] = 1;
	2193
	2194	lower_zone = pgdat->node_zones + idx;
	2195	lower_zone->lowmem_reserve[j] = present_pages /
	2196	sysctl_lowmem_reserve_ratio[idx];
	2197	present_pages += lower_zone->present_pages;
	2198	}
	2199	}
	2200	}
	2201
	2202	/* update totalreserve_pages */
	2203	calculate_totalreserve_pages();
	2204	}
	2205
	2206	/*
	2207	* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
	2208	* that the pages_{min,low,high} values for each zone are set correctly
	2209	* with respect to min_free_kbytes.
	2210	*/
	2211	void setup_per_zone_pages_min(void)
	2212	{
	2213	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	2214	unsigned long lowmem_pages = 0;
	2215	struct zone *zone;
	2216	unsigned long flags;
	2217
	2218	/* Calculate total number of !ZONE_HIGHMEM pages */
	2219	for_each_zone(zone) {
	2220	if (!is_highmem(zone))
	2221	lowmem_pages += zone->present_pages;
	2222	}
	2223
	2224	for_each_zone(zone) {
	2225	u64 tmp;
	2226
	2227	spin_lock_irqsave(&zone->lru_lock, flags);
	2228	tmp = (u64)pages_min * zone->present_pages;
	2229	do_div(tmp, lowmem_pages);
	2230	if (is_highmem(zone)) {
	2231	/*
	2232	* __GFP_HIGH and PF_MEMALLOC allocations usually don't
	2233	* need highmem pages, so cap pages_min to a small
	2234	* value here.
	2235	*
	2236	* The (pages_high-pages_low) and (pages_low-pages_min)
	2237	* deltas controls asynch page reclaim, and so should
	2238	* not be capped for highmem.
	2239	*/
	2240	int min_pages;
	2241
	2242	min_pages = zone->present_pages / 1024;
	2243	if (min_pages < SWAP_CLUSTER_MAX)
	2244	min_pages = SWAP_CLUSTER_MAX;
	2245	if (min_pages > 128)
	2246	min_pages = 128;
	2247	zone->pages_min = min_pages;
	2248	} else {
	2249	/*
	2250	* If it's a lowmem zone, reserve a number of pages
	2251	* proportionate to the zone's size.
	2252	*/
	2253	zone->pages_min = tmp;
	2254	}
	2255
	2256	zone->pages_low = zone->pages_min + (tmp >> 2);
	2257	zone->pages_high = zone->pages_min + (tmp >> 1);
	2258	spin_unlock_irqrestore(&zone->lru_lock, flags);
	2259	}
	2260
	2261	/* update totalreserve_pages */
	2262	calculate_totalreserve_pages();
	2263	}
	2264
	2265	/*
	2266	* Initialise min_free_kbytes.
	2267	*
	2268	* For small machines we want it small (128k min). For large machines
	2269	* we want it large (64MB max). But it is not linear, because network
	2270	* bandwidth does not increase linearly with machine size. We use
	2271	*
	2272	* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
	2273	* min_free_kbytes = sqrt(lowmem_kbytes * 16)
	2274	*
	2275	* which yields
	2276	*
	2277	* 16MB: 512k
	2278	* 32MB: 724k
	2279	* 64MB: 1024k
	2280	* 128MB: 1448k
	2281	* 256MB: 2048k
	2282	* 512MB: 2896k
	2283	* 1024MB: 4096k
	2284	* 2048MB: 5792k
	2285	* 4096MB: 8192k
	2286	* 8192MB: 11584k
	2287	* 16384MB: 16384k
	2288	*/
	2289	static int __init init_per_zone_pages_min(void)
	2290	{
	2291	unsigned long lowmem_kbytes;
	2292
	2293	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	2294
	2295	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
	2296	if (min_free_kbytes < 128)
	2297	min_free_kbytes = 128;
	2298	if (min_free_kbytes > 65536)
	2299	min_free_kbytes = 65536;
	2300	setup_per_zone_pages_min();
	2301	setup_per_zone_lowmem_reserve();
	2302	return 0;
	2303	}
	2304	module_init(init_per_zone_pages_min)
	2305
	2306	/*
	2307	* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
	2308	* that we can call two helper functions whenever min_free_kbytes
	2309	* changes.
	2310	*/
	2311	int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
	2312	struct file file, void __user buffer, size_t length, loff_t ppos)
	2313	{
	2314	proc_dointvec(table, write, file, buffer, length, ppos);
	2315	setup_per_zone_pages_min();
	2316	return 0;
	2317	}
	2318
	2319	#ifdef CONFIG_NUMA
	2320	int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
	2321	struct file file, void __user buffer, size_t length, loff_t ppos)
	2322	{
	2323	struct zone *zone;
	2324	int rc;
	2325
	2326	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	2327	if (rc)
	2328	return rc;
	2329
	2330	for_each_zone(zone)
	2331	zone->min_unmapped_ratio = (zone->present_pages *
	2332	sysctl_min_unmapped_ratio) / 100;
	2333	return 0;
	2334	}
	2335	#endif
	2336
	2337	/*
	2338	* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
	2339	* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
	2340	* whenever sysctl_lowmem_reserve_ratio changes.
	2341	*
	2342	* The reserve ratio obviously has absolutely no relation with the
	2343	* pages_min watermarks. The lowmem reserve ratio can only make sense
	2344	* if in function of the boot time zone sizes.
	2345	*/
	2346	int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
	2347	struct file file, void __user buffer, size_t length, loff_t ppos)
	2348	{
	2349	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	2350	setup_per_zone_lowmem_reserve();
	2351	return 0;
	2352	}
	2353
	2354	/*
	2355	* percpu_pagelist_fraction - changes the pcp->high for each zone on each
	2356	* cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
	2357	* can have before it gets flushed back to buddy allocator.
	2358	*/
	2359
	2360	int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
	2361	struct file file, void __user buffer, size_t length, loff_t ppos)
	2362	{
	2363	struct zone *zone;
	2364	unsigned int cpu;
	2365	int ret;
	2366
	2367	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	2368	if (!write \|\| (ret == -EINVAL))
	2369	return ret;
	2370	for_each_zone(zone) {
	2371	for_each_online_cpu(cpu) {
	2372	unsigned long high;
	2373	high = zone->present_pages / percpu_pagelist_fraction;
	2374	setup_pagelist_highmark(zone_pcp(zone, cpu), high);
	2375	}
	2376	}
	2377	return 0;
	2378	}
	2379
	2380	__initdata int hashdist = HASHDIST_DEFAULT;
	2381
	2382	#ifdef CONFIG_NUMA
	2383	static int __init set_hashdist(char *str)
	2384	{
	2385	if (!str)
	2386	return 0;
	2387	hashdist = simple_strtoul(str, &str, 0);
	2388	return 1;
	2389	}
	2390	__setup("hashdist=", set_hashdist);
	2391	#endif
	2392
	2393	/*
	2394	* allocate a large system hash table from bootmem
	2395	* - it is assumed that the hash table must contain an exact power-of-2
	2396	* quantity of entries
	2397	* - limit is the number of hash buckets, not the total allocation size
	2398	*/
	2399	void __init alloc_large_system_hash(const char tablename,
	2400	unsigned long bucketsize,
	2401	unsigned long numentries,
	2402	int scale,
	2403	int flags,
	2404	unsigned int *_hash_shift,
	2405	unsigned int *_hash_mask,
	2406	unsigned long limit)
	2407	{
	2408	unsigned long long max = limit;
	2409	unsigned long log2qty, size;
	2410	void *table = NULL;
	2411
	2412	/* allow the kernel cmdline to have a say */
	2413	if (!numentries) {
	2414	/* round applicable memory size up to nearest megabyte */
	2415	numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
	2416	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
	2417	numentries >>= 20 - PAGE_SHIFT;
	2418	numentries <<= 20 - PAGE_SHIFT;
	2419
	2420	/* limit to 1 bucket per 2^scale bytes of low memory */
	2421	if (scale > PAGE_SHIFT)
	2422	numentries >>= (scale - PAGE_SHIFT);
	2423	else
	2424	numentries <<= (PAGE_SHIFT - scale);
	2425	}
	2426	numentries = roundup_pow_of_two(numentries);
	2427
	2428	/* limit allocation size to 1/16 total memory by default */
	2429	if (max == 0) {
	2430	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
	2431	do_div(max, bucketsize);
	2432	}
	2433
	2434	if (numentries > max)
	2435	numentries = max;
	2436
	2437	log2qty = long_log2(numentries);
	2438
	2439	do {
	2440	size = bucketsize << log2qty;
	2441	if (flags & HASH_EARLY)
	2442	table = alloc_bootmem(size);
	2443	else if (hashdist)
	2444	table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
	2445	else {
	2446	unsigned long order;
	2447	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
	2448	;
	2449	table = (void*) __get_free_pages(GFP_ATOMIC, order);
	2450	}
	2451	} while (!table && size > PAGE_SIZE && --log2qty);
	2452
	2453	if (!table)
	2454	panic("Failed to allocate %s hash table\n", tablename);
	2455
	2456	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
	2457	tablename,
	2458	(1U << log2qty),
	2459	long_log2(size) - PAGE_SHIFT,
	2460	size);
	2461
	2462	if (_hash_shift)
	2463	*_hash_shift = log2qty;
	2464	if (_hash_mask)
	2465	*_hash_mask = (1 << log2qty) - 1;
	2466
	2467	return table;
	2468	}
	2469
	2470	#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
	2471	struct page *pfn_to_page(unsigned long pfn)
	2472	{
	2473	return __pfn_to_page(pfn);
	2474	}
	2475	unsigned long page_to_pfn(struct page *page)
	2476	{
	2477	return __page_to_pfn(page);
	2478	}
	2479	EXPORT_SYMBOL(pfn_to_page);
	2480	EXPORT_SYMBOL(page_to_pfn);
	2481	#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: