Index: 9906.2/include/linux/swap.h
--- 9906.2/include/linux/swap.h Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/C/b/20_swap.h 1.4.1.15.1.1 644)
+++ 9906.5/include/linux/swap.h Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/C/b/20_swap.h 1.4.1.15.1.1.1.1 644)
@@ -87,7 +87,6 @@
/* linux/mm/vmscan.c */
extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone);
-extern int swap_out(unsigned int gfp_mask, int priority);
/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *, int);
Index: 9906.2/mm/vmscan.c
--- 9906.2/mm/vmscan.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/13_vmscan.c 1.5.1.22 644)
+++ 9906.5/mm/vmscan.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/13_vmscan.c 1.5.1.22.2.1 644)
@@ -48,7 +48,6 @@
if ((page-mem_map >= max_mapnr) || PageReserved(page))
goto out_failed;
- mm->swap_cnt--;
/* Don't look at this pte if it's been accessed recently. */
if (pte_young(pte)) {
/*
@@ -220,8 +219,6 @@
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
@@ -251,8 +248,6 @@
int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
@@ -277,8 +272,6 @@
int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (address && (address < end));
@@ -328,7 +321,7 @@
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-int swap_out(unsigned int priority, int gfp_mask)
+static int swap_out(unsigned int priority, int gfp_mask)
{
struct task_struct * p;
int counter;
@@ -363,7 +356,6 @@
p = init_task.next_task;
for (; p != &init_task; p = p->next_task) {
struct mm_struct *mm = p->mm;
- p->hog = 0;
if (!p->swappable || !mm)
continue;
if (mm->rss <= 0)
@@ -377,26 +369,9 @@
pid = p->pid;
}
}
- if (assign == 1) {
- /* we just assigned swap_cnt, normalise values */
- assign = 2;
- p = init_task.next_task;
- for (; p != &init_task; p = p->next_task) {
- int i = 0;
- struct mm_struct *mm = p->mm;
- if (!p->swappable || !mm || mm->rss <= 0)
- continue;
- /* small processes are swapped out less */
- while ((mm->swap_cnt << 2 * (i + 1) < max_cnt))
- i++;
- mm->swap_cnt >>= i;
- mm->swap_cnt += i; /* if swap_cnt reaches 0 */
- /* we're big -> hog treatment */
- if (!i)
- p->hog = 1;
- }
- }
read_unlock(&tasklist_lock);
+ if (assign == 1)
+ assign = 2;
if (!best) {
if (!assign) {
assign = 1;
@@ -437,14 +412,13 @@
{
int priority;
int count = SWAP_CLUSTER_MAX;
- int ret;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
priority = 6;
do {
- while ((ret = shrink_mmap(priority, gfp_mask, zone))) {
+ while (shrink_mmap(priority, gfp_mask, zone)) {
if (!--count)
goto done;
}
@@ -467,9 +441,7 @@
}
}
- /* Then, try to page stuff out..
- * We use swapcount here because this doesn't actually
- * free pages */
+ /* Then, try to page stuff out.. */
while (swap_out(priority, gfp_mask)) {
if (!--count)
goto done;
@@ -497,10 +469,7 @@
*/
int kswapd(void *unused)
{
- int i;
struct task_struct *tsk = current;
- pg_data_t *pgdat;
- zone_t *zone;
tsk->session = 1;
tsk->pgrp = 1;
@@ -521,25 +490,38 @@
*/
tsk->flags |= PF_MEMALLOC;
- while (1) {
+ for (;;) {
+ int work_to_do = 0;
+
/*
* If we actually get into a low-memory situation,
* the processes needing more memory will wake us
* up on a more timely basis.
*/
- pgdat = pgdat_list;
- while (pgdat) {
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zone = pgdat->node_zones + i;
- if (tsk->need_resched)
- schedule();
- if ((!zone->size) || (!zone->zone_wake_kswapd))
- continue;
- do_try_to_free_pages(GFP_KSWAPD, zone);
+ do {
+ pg_data_t *pgdat = pgdat_list;
+
+ while (pgdat) {
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+
+ if (!zone->size)
+ continue;
+ if (!zone->low_on_memory)
+ continue;
+ work_to_do = 1;
+ do_try_to_free_pages(GFP_KSWAPD, zone);
+ }
+ pgdat = pgdat->node_next;
}
- pgdat = pgdat->node_next;
- }
- run_task_queue(&tq_disk);
+ run_task_queue(&tq_disk);
+ if (tsk->need_resched)
+ break;
+ if (nr_free_pages() > freepages.high)
+ break;
+ } while (work_to_do);
tsk->state = TASK_INTERRUPTIBLE;
interruptible_sleep_on(&kswapd_wait);
}
Index: 9906.2/mm/filemap.c
--- 9906.2/mm/filemap.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/16_filemap.c 1.6.1.3.2.4.1.1.2.2.2.1.1.21.1.1 644)
+++ 9906.5/mm/filemap.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/16_filemap.c 1.6.1.3.2.4.1.1.2.2.2.1.1.21.1.1.2.1 644)
@@ -238,55 +238,41 @@
int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
- int ret = 0, loop = 0, count;
+ int ret = 0, count;
LIST_HEAD(young);
LIST_HEAD(old);
LIST_HEAD(forget);
struct list_head * page_lru, * dispose;
- struct page * page = NULL;
- struct zone_struct * p_zone;
- int maxloop = 256 >> priority;
+ struct page * page;
if (!zone)
BUG();
- count = nr_lru_pages >> priority;
- if (!count)
- return ret;
+ count = nr_lru_pages / (priority+1);
spin_lock(&pagemap_lru_lock);
-again:
- /* we need pagemap_lru_lock for list_del() ... subtle code below */
+
while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
page = list_entry(page_lru, struct page, lru);
list_del(page_lru);
- p_zone = page->zone;
- /*
- * These two tests are there to make sure we don't free too
- * many pages from the "wrong" zone. We free some anyway,
- * they are the least recently used pages in the system.
- * When we don't free them, leave them in &old.
- */
- dispose = &old;
- if (p_zone != zone && (loop > (maxloop / 4) ||
- p_zone->free_pages > p_zone->pages_high))
+ dispose = &lru_cache;
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ /* Roll the page back to the top of the lru list;
+ * we could also be more aggressive and put the
+ * page on the young dispose list, so we avoid
+ * freeing young pages on each pass.
+ */
goto dispose_continue;
- /* The page is in use, or was used very recently, put it in
- * &young to make sure that we won't try to free it the next
- * time */
- dispose = &young;
-
- if (test_and_clear_bit(PG_referenced, &page->flags))
+ dispose = &old;
+ /* don't account passes over pages outside the target memory class */
+ if (zone && (!memclass(page->zone, zone)))
goto dispose_continue;
count--;
- if (!page->buffers && page_count(page) > 1)
- goto dispose_continue;
- /* Page not used -> free it; if that fails -> &old */
- dispose = &old;
+ dispose = &young;
if (TryLockPage(page))
goto dispose_continue;
@@ -297,11 +283,22 @@
page locked down ;). */
spin_unlock(&pagemap_lru_lock);
+ /* avoid unscalable SMP locking */
+ if (!page->buffers && page_count(page) > 1)
+ goto unlock_noput_continue;
+
+ /* Hold the pagecache_lock spinlock so that other
+ tasks cannot notice the page while we are looking at its
+ page count. If it's a pagecache page we'll free it
+ in one atomic transaction after checking its page count. */
+ spin_lock(&pagecache_lock);
+
/* avoid freeing the page while it's locked */
get_page(page);
/* Is it a buffer page? */
if (page->buffers) {
+ spin_unlock(&pagecache_lock);
if (!try_to_free_buffers(page))
goto unlock_continue;
/* page was locked, inode can't go away under us */
@@ -309,14 +306,9 @@
atomic_dec(&buffermem_pages);
goto made_buffer_progress;
}
+ spin_lock(&pagecache_lock);
}
- /* Take the pagecache_lock spinlock held to avoid
- other tasks to notice the page while we are looking at its
- page count. If it's a pagecache-page we'll free it
- in one atomic transaction after checking its page count. */
- spin_lock(&pagecache_lock);
-
/*
* We can't free pages unless there's just one user
* (count == 2 because we added one ourselves above).
@@ -325,6 +317,12 @@
goto cache_unlock_continue;
/*
+ * We did the page aging part; if the lru list has shrunk below its limit, keep the page.
+ */
+ if (nr_lru_pages < freepages.min * priority)
+ goto cache_unlock_continue;
+
+ /*
* Is it a page swap page? If so, we want to
* drop it if it is no longer used, even if it
* were to be marked referenced..
@@ -353,13 +351,21 @@
cache_unlock_continue:
spin_unlock(&pagecache_lock);
unlock_continue:
- spin_lock(&pagemap_lru_lock);
UnlockPage(page);
put_page(page);
+dispose_relock_continue:
+ /* Even if the dispose list is local, a truncate_inode_page()
+ may remove a page from its queue, so always
+ synchronize with the lru lock while accessing the
+ page->lru field */
+ spin_lock(&pagemap_lru_lock);
list_add(page_lru, dispose);
continue;
- /* we're holding pagemap_lru_lock, so we can just loop again */
+unlock_noput_continue:
+ UnlockPage(page);
+ goto dispose_relock_continue;
+
dispose_continue:
list_add(page_lru, dispose);
}
@@ -374,11 +380,6 @@
spin_lock(&pagemap_lru_lock);
/* nr_lru_pages needs the spinlock */
nr_lru_pages--;
-
- loop++;
- /* wrong zone? not looped too often? roll again... */
- if (page->zone != zone && loop < maxloop)
- goto again;
out:
list_splice(&young, &lru_cache);
Index: 9906.2/mm/page_alloc.c
--- 9906.2/mm/page_alloc.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/18_page_alloc 1.5.2.21 644)
+++ 9906.5/mm/page_alloc.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/18_page_alloc 1.5.2.21.2.1 644)
@@ -58,8 +58,6 @@
*/
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
-#if 0
-
static inline unsigned long classfree(zone_t *zone)
{
unsigned long free = 0;
@@ -73,8 +71,6 @@
return(free);
}
-#endif
-
/*
* Buddy system. Hairy. You really aren't expected to understand this
*
@@ -156,10 +152,8 @@
spin_unlock_irqrestore(&zone->lock, flags);
- if (zone->free_pages > zone->pages_high) {
- zone->zone_wake_kswapd = 0;
+ if (zone->free_pages > zone->pages_high)
zone->low_on_memory = 0;
- }
}
#define MARK_USED(index, order, area) \
@@ -186,8 +180,7 @@
return page;
}
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static inline struct page * rmqueue(zone_t *zone, unsigned long order)
{
free_area_t * area = zone->free_area + order;
unsigned long curr_order = order;
@@ -227,115 +220,72 @@
return NULL;
}
-static int zone_balance_memory(zonelist_t *zonelist)
-{
- int tried = 0, freed = 0;
- zone_t **zone;
- int gfp_mask = zonelist->gfp_mask;
- extern wait_queue_head_t kswapd_wait;
-
- zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (z->free_pages > z->pages_low)
- continue;
-
- z->zone_wake_kswapd = 1;
- wake_up_interruptible(&kswapd_wait);
-
- /* Are we reaching the critical stage? */
- if (!z->low_on_memory) {
- /* Not yet critical, so let kswapd handle it.. */
- if (z->free_pages > z->pages_min)
- continue;
- z->low_on_memory = 1;
- }
- /*
- * In the atomic allocation case we only 'kick' the
- * state machine, but do not try to free pages
- * ourselves.
- */
- tried = 1;
- freed |= try_to_free_pages(gfp_mask, z);
- }
- if (tried && !freed) {
- if (!(gfp_mask & __GFP_HIGH))
- return 0;
- }
- return 1;
-}
-
/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
zone_t **zone = zonelist->zones;
- int gfp_mask = zonelist->gfp_mask;
- static int low_on_memory;
-
- /*
- * If this is a recursive call, we'd better
- * do our best to just allocate things without
- * further thought.
- */
- if (current->flags & PF_MEMALLOC)
- goto allocate_ok;
-
- /* If we're a memory hog, unmap some pages */
- if (current->hog && low_on_memory &&
- (gfp_mask & __GFP_WAIT))
- swap_out(4, gfp_mask);
/*
* (If anyone calls gfp from interrupts nonatomically then it
- * will sooner or later tripped up by a schedule().)
+ * will sooner or later be tripped up by a schedule().)
*
* We are falling back to lower-level zones if allocation
* in a higher zone fails.
*/
for (;;) {
zone_t *z = *(zone++);
+
if (!z)
break;
+
if (!z->size)
BUG();
- /* Are we supposed to free memory? Don't make it worse.. */
- if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
+ /*
+ * If this is a recursive call, we'd better
+ * do our best to just allocate things without
+ * further thought.
+ */
+ if (!(current->flags & PF_MEMALLOC)) {
+ if (z->free_pages <= z->pages_high) {
+ unsigned long free = classfree(z);
+
+ if (free <= z->pages_low) {
+ extern wait_queue_head_t kswapd_wait;
+
+ z->low_on_memory = 1;
+ wake_up_interruptible(&kswapd_wait);
+ }
+
+ if (free <= z->pages_min) {
+ int gfp_mask = zonelist->gfp_mask;
+
+ if (!try_to_free_pages(gfp_mask, z)) {
+ if (!(gfp_mask & __GFP_HIGH))
+ return NULL;
+ }
+ }
+ }
+ }
+
+ /*
+ * This is an optimization for the 'higher order zone
+ * is empty' case - it can happen even in well-behaved
+ * systems, think the page-cache filling up all RAM.
+ * We skip over empty zones. (This is not exact because
+ * we do not take the spinlock and it's not exact for
+ * the higher order case, but it will do for most things.)
+ */
+ if (z->free_pages) {
struct page *page = rmqueue(z, order);
- low_on_memory = 0;
+
if (page)
return page;
}
}
-
- low_on_memory = 1;
- /*
- * Ok, no obvious zones were available, start
- * balancing things a bit..
- */
- if (zone_balance_memory(zonelist)) {
- zone = zonelist->zones;
-allocate_ok:
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (z->free_pages) {
- struct page *page = rmqueue(z, order);
- if (page)
- return page;
- }
- }
- }
return NULL;
-
-/*
- * The main chunk of the balancing code is in this offline branch:
- */
}
/*
@@ -599,7 +549,6 @@
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->low_on_memory = 0;
- zone->zone_wake_kswapd = 0;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
@@ -642,7 +591,8 @@
while (get_option(&str, &zone_balance_ratio[j++]) == 2);
printk("setup_mem_frac: ");
- for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
+ for (j = 0; j < MAX_NR_ZONES; j++)
+ printk("%d ", zone_balance_ratio[j]);
printk("\n");
return 1;
}
Index: 9906.2/include/linux/mmzone.h
--- 9906.2/include/linux/mmzone.h Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/u/c/2_mmzone.h 1.9 644)
+++ 9906.5/include/linux/mmzone.h Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/u/c/2_mmzone.h 1.10 644)
@@ -29,7 +29,6 @@
unsigned long offset;
unsigned long free_pages;
char low_on_memory;
- char zone_wake_kswapd;
unsigned long pages_min, pages_low, pages_high;
/*