Index: 9906.2/include/linux/swap.h
--- 9906.2/include/linux/swap.h Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/C/b/20_swap.h 1.4.1.15.1.1 644)
+++ 9906.5/include/linux/swap.h Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/C/b/20_swap.h 1.4.1.15.1.1.1.1 644)
@@ -87,7 +87,6 @@
/* linux/mm/vmscan.c */
extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone);
-extern int swap_out(unsigned int gfp_mask, int priority);
/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *, int);
Index: 9906.2/mm/vmscan.c
--- 9906.2/mm/vmscan.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/13_vmscan.c 1.5.1.22 644)
+++ 9906.5/mm/vmscan.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/13_vmscan.c 1.5.1.22.2.1 644)
@@ -48,7 +48,6 @@
if ((page-mem_map >= max_mapnr) || PageReserved(page))
goto out_failed;
- mm->swap_cnt--;
/* Don't look at this pte if it's been accessed recently. */
if (pte_young(pte)) {
/*
@@ -220,8 +219,6 @@
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
@@ -251,8 +248,6 @@
int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
@@ -277,8 +272,6 @@
int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
if (result)
return result;
- if (!mm->swap_cnt)
- return 0;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (address && (address < end));
@@ -328,7 +321,7 @@
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-int swap_out(unsigned int priority, int gfp_mask)
+static int swap_out(unsigned int priority, int gfp_mask)
{
struct task_struct * p;
int counter;
@@ -363,7 +356,6 @@
p = init_task.next_task;
for (; p != &init_task; p = p->next_task) {
struct mm_struct *mm = p->mm;
- p->hog = 0;
if (!p->swappable || !mm)
continue;
if (mm->rss <= 0)
@@ -377,26 +369,9 @@
pid = p->pid;
}
}
- if (assign == 1) {
- /* we just assigned swap_cnt, normalise values */
- assign = 2;
- p = init_task.next_task;
- for (; p != &init_task; p = p->next_task) {
- int i = 0;
- struct mm_struct *mm = p->mm;
- if (!p->swappable || !mm || mm->rss <= 0)
- continue;
- /* small processes are swapped out less */
- while ((mm->swap_cnt << 2 * (i + 1) < max_cnt))
- i++;
- mm->swap_cnt >>= i;
- mm->swap_cnt += i; /* if swap_cnt reaches 0 */
- /* we're big -> hog treatment */
- if (!i)
- p->hog = 1;
- }
- }
read_unlock(&tasklist_lock);
+ if (assign == 1)
+ assign = 2;
if (!best) {
if (!assign) {
assign = 1;
@@ -437,14 +412,13 @@
{
int priority;
int count = SWAP_CLUSTER_MAX;
- int ret;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
priority = 6;
do {
- while ((ret = shrink_mmap(priority, gfp_mask, zone))) {
+ while (shrink_mmap(priority, gfp_mask, zone)) {
if (!--count)
goto done;
}
@@ -467,9 +441,7 @@
}
}
- /* Then, try to page stuff out..
- * We use swapcount here because this doesn't actually
- * free pages */
+ /* Then, try to page stuff out.. */
while (swap_out(priority, gfp_mask)) {
if (!--count)
goto done;
@@ -497,10 +469,7 @@
*/
int kswapd(void *unused)
{
- int i;
struct task_struct *tsk = current;
- pg_data_t *pgdat;
- zone_t *zone;
tsk->session = 1;
tsk->pgrp = 1;
@@ -521,25 +490,38 @@
*/
tsk->flags |= PF_MEMALLOC;
- while (1) {
+ for (;;) {
+ int work_to_do = 0;
+
/*
* If we actually get into a low-memory situation,
* the processes needing more memory will wake us
* up on a more timely basis.
*/
- pgdat = pgdat_list;
- while (pgdat) {
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zone = pgdat->node_zones + i;
- if (tsk->need_resched)
- schedule();
- if ((!zone->size) || (!zone->zone_wake_kswapd))
- continue;
- do_try_to_free_pages(GFP_KSWAPD, zone);
+ do {
+ pg_data_t *pgdat = pgdat_list;
+
+ while (pgdat) {
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+
+ if (!zone->size)
+ continue;
+ if (!zone->low_on_memory)
+ continue;
+ work_to_do = 1;
+ do_try_to_free_pages(GFP_KSWAPD, zone);
+ }
+ pgdat = pgdat->node_next;
}
- pgdat = pgdat->node_next;
- }
- run_task_queue(&tq_disk);
+ run_task_queue(&tq_disk);
+ if (tsk->need_resched)
+ break;
+ if (nr_free_pages() > freepages.high)
+ break;
+ } while (work_to_do);
tsk->state = TASK_INTERRUPTIBLE;
interruptible_sleep_on(&kswapd_wait);
}
Index: 9906.2/mm/filemap.c
--- 9906.2/mm/filemap.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/16_filemap.c 1.6.1.3.2.4.1.1.2.2.2.1.1.21.1.1 644)
+++ 9906.5/mm/filemap.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/16_filemap.c 1.6.1.3.2.4.1.1.2.2.2.1.1.21.1.1.2.1 644)
@@ -238,55 +238,41 @@
int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
- int ret = 0, loop = 0, count;
+ int ret = 0, count;
LIST_HEAD(young);
LIST_HEAD(old);
LIST_HEAD(forget);
struct list_head * page_lru, * dispose;
- struct page * page = NULL;
- struct zone_struct * p_zone;
- int maxloop = 256 >> priority;
+ struct page * page;
if (!zone)
BUG();
- count = nr_lru_pages >> priority;
- if (!count)
- return ret;
+ count = nr_lru_pages / (priority+1);
spin_lock(&pagemap_lru_lock);
-again:
- /* we need pagemap_lru_lock for list_del() ... subtle code below */
+
while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
page = list_entry(page_lru, struct page, lru);
list_del(page_lru);
- p_zone = page->zone;
- /*
- * These two tests are there to make sure we don't free too
- * many pages from the "wrong" zone. We free some anyway,
- * they are the least recently used pages in the system.
- * When we don't free them, leave them in &old.
- */
- dispose = &old;
- if (p_zone != zone && (loop > (maxloop / 4) ||
- p_zone->free_pages > p_zone->pages_high))
+ dispose = &lru_cache;
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ /* Roll the page back to the top of the lru list;
+ * we could also be more aggressive and put the
+ * page on the young dispose list, so we avoid
+ * freeing young pages on each pass.
+ */
goto dispose_continue;
- /* The page is in use, or was used very recently, put it in
- * &young to make sure that we won't try to free it the next
- * time */
- dispose = &young;
-
- if (test_and_clear_bit(PG_referenced, &page->flags))
+ dispose = &old;
+ /* don't account passes over pages outside the target memory class */
+ if (zone && (!memclass(page->zone, zone)))
goto dispose_continue;
count--;
- if (!page->buffers && page_count(page) > 1)
- goto dispose_continue;
- /* Page not used -> free it; if that fails -> &old */
- dispose = &old;
+ dispose = &young;
if (TryLockPage(page))
goto dispose_continue;
@@ -297,11 +283,22 @@
page locked down ;). */
spin_unlock(&pagemap_lru_lock);
+ /* avoid unscalable SMP locking */
+ if (!page->buffers && page_count(page) > 1)
+ goto unlock_noput_continue;
+
+ /* Hold the pagecache_lock spinlock so that other
+ tasks cannot notice the page while we are looking at its
+ page count. If it's a pagecache page we'll free it
+ in one atomic transaction after checking its page count. */
+ spin_lock(&pagecache_lock);
+
/* avoid freeing the page while it's locked */
get_page(page);
/* Is it a buffer page? */
if (page->buffers) {
+ spin_unlock(&pagecache_lock);
if (!try_to_free_buffers(page))
goto unlock_continue;
/* page was locked, inode can't go away under us */
@@ -309,14 +306,9 @@
atomic_dec(&buffermem_pages);
goto made_buffer_progress;
}
+ spin_lock(&pagecache_lock);
}
- /* Take the pagecache_lock spinlock held to avoid
- other tasks to notice the page while we are looking at its
- page count. If it's a pagecache-page we'll free it
- in one atomic transaction after checking its page count. */
- spin_lock(&pagecache_lock);
-
/*
* We can't free pages unless there's just one user
* (count == 2 because we added one ourselves above).
@@ -325,6 +317,12 @@
goto cache_unlock_continue;
/*
+ * We did the page aging part; if the lru list has shrunk below its limit, keep the page.
+ */
+ if (nr_lru_pages < freepages.min * priority)
+ goto cache_unlock_continue;
+
+ /*
* Is it a page swap page? If so, we want to
* drop it if it is no longer used, even if it
* were to be marked referenced..
@@ -353,13 +351,21 @@
cache_unlock_continue:
spin_unlock(&pagecache_lock);
unlock_continue:
- spin_lock(&pagemap_lru_lock);
UnlockPage(page);
put_page(page);
+dispose_relock_continue:
+ /* Even if the dispose list is local, a truncate_inode_page()
+ may remove a page from its queue, so always
+ synchronize with the lru lock while accessing the
+ page->lru field */
+ spin_lock(&pagemap_lru_lock);
list_add(page_lru, dispose);
continue;
- /* we're holding pagemap_lru_lock, so we can just loop again */
+unlock_noput_continue:
+ UnlockPage(page);
+ goto dispose_relock_continue;
+
dispose_continue:
list_add(page_lru, dispose);
}
@@ -374,11 +380,6 @@
spin_lock(&pagemap_lru_lock);
/* nr_lru_pages needs the spinlock */
nr_lru_pages--;
-
- loop++;
- /* wrong zone? not looped too often? roll again... */
- if (page->zone != zone && loop < maxloop)
- goto again;
out:
list_splice(&young, &lru_cache);
Index: 9906.2/mm/page_alloc.c
--- 9906.2/mm/page_alloc.c Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/F/b/18_page_alloc 1.5.2.21 644)
+++ 9906.5/mm/page_alloc.c Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/F/b/18_page_alloc 1.5.2.21.2.1 644)
@@ -58,8 +58,6 @@
*/
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
-#if 0
-
static inline unsigned long classfree(zone_t *zone)
{
unsigned long free = 0;
@@ -73,8 +71,6 @@
return(free);
}
-#endif
-
/*
* Buddy system. Hairy. You really aren't expected to understand this
*
@@ -156,10 +152,8 @@
spin_unlock_irqrestore(&zone->lock, flags);
- if (zone->free_pages > zone->pages_high) {
- zone->zone_wake_kswapd = 0;
+ if (zone->free_pages > zone->pages_high)
zone->low_on_memory = 0;
- }
}
#define MARK_USED(index, order, area) \
@@ -186,8 +180,7 @@
return page;
}
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static inline struct page * rmqueue(zone_t *zone, unsigned long order)
{
free_area_t * area = zone->free_area + order;
unsigned long curr_order = order;
@@ -227,115 +220,72 @@
return NULL;
}
-static int zone_balance_memory(zonelist_t *zonelist)
-{
- int tried = 0, freed = 0;
- zone_t **zone;
- int gfp_mask = zonelist->gfp_mask;
- extern wait_queue_head_t kswapd_wait;
-
- zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (z->free_pages > z->pages_low)
- continue;
-
- z->zone_wake_kswapd = 1;
- wake_up_interruptible(&kswapd_wait);
-
- /* Are we reaching the critical stage? */
- if (!z->low_on_memory) {
- /* Not yet critical, so let kswapd handle it.. */
- if (z->free_pages > z->pages_min)
- continue;
- z->low_on_memory = 1;
- }
- /*
- * In the atomic allocation case we only 'kick' the
- * state machine, but do not try to free pages
- * ourselves.
- */
- tried = 1;
- freed |= try_to_free_pages(gfp_mask, z);
- }
- if (tried && !freed) {
- if (!(gfp_mask & __GFP_HIGH))
- return 0;
- }
- return 1;
-}
-
/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
zone_t **zone = zonelist->zones;
- int gfp_mask = zonelist->gfp_mask;
- static int low_on_memory;
-
- /*
- * If this is a recursive call, we'd better
- * do our best to just allocate things without
- * further thought.
- */
- if (current->flags & PF_MEMALLOC)
- goto allocate_ok;
-
- /* If we're a memory hog, unmap some pages */
- if (current->hog && low_on_memory &&
- (gfp_mask & __GFP_WAIT))
- swap_out(4, gfp_mask);
/*
* (If anyone calls gfp from interrupts nonatomically then it
- * will sooner or later tripped up by a schedule().)
+ * will sooner or later be tripped up by a schedule().)
*
* We are falling back to lower-level zones if allocation
* in a higher zone fails.
*/
for (;;) {
zone_t *z = *(zone++);
+
if (!z)
break;
+
if (!z->size)
BUG();
- /* Are we supposed to free memory? Don't make it worse.. */
- if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
+ /*
+ * If this is a recursive call, we'd better
+ * do our best to just allocate things without
+ * further thought.
+ */
+ if (!(current->flags & PF_MEMALLOC)) {
+ if (z->free_pages <= z->pages_high) {
+ unsigned long free = classfree(z);
+
+ if (free <= z->pages_low) {
+ extern wait_queue_head_t kswapd_wait;
+
+ z->low_on_memory = 1;
+ wake_up_interruptible(&kswapd_wait);
+ }
+
+ if (free <= z->pages_min) {
+ int gfp_mask = zonelist->gfp_mask;
+
+ if (!try_to_free_pages(gfp_mask, z)) {
+ if (!(gfp_mask & __GFP_HIGH))
+ return NULL;
+ }
+ }
+ }
+ }
+
+ /*
+ * This is an optimization for the 'higher order zone
+ * is empty' case - it can happen even in well-behaved
+ * systems, think the page-cache filling up all RAM.
+ * We skip over empty zones. (This is not exact because
+ * we do not take the spinlock and it's not exact for
+ * the higher order case, but it will do for most things.)
+ */
+ if (z->free_pages) {
struct page *page = rmqueue(z, order);
- low_on_memory = 0;
+
if (page)
return page;
}
}
-
- low_on_memory = 1;
- /*
- * Ok, no obvious zones were available, start
- * balancing things a bit..
- */
- if (zone_balance_memory(zonelist)) {
- zone = zonelist->zones;
-allocate_ok:
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (z->free_pages) {
- struct page *page = rmqueue(z, order);
- if (page)
- return page;
- }
- }
- }
return NULL;
-
-/*
- * The main chunk of the balancing code is in this offline branch:
- */
}
/*
@@ -599,7 +549,6 @@
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->low_on_memory = 0;
- zone->zone_wake_kswapd = 0;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
@@ -642,7 +591,8 @@
while (get_option(&str, &zone_balance_ratio[j++]) == 2);
printk("setup_mem_frac: ");
- for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
+ for (j = 0; j < MAX_NR_ZONES; j++)
+ printk("%d ", zone_balance_ratio[j]);
printk("\n");
return 1;
}
Index: 9906.2/include/linux/mmzone.h
--- 9906.2/include/linux/mmzone.h Thu, 27 Apr 2000 22:11:43 +0200 zcalusic (linux/u/c/2_mmzone.h 1.9 644)
+++ 9906.5/include/linux/mmzone.h Sun, 07 May 2000 20:39:35 +0200 zcalusic (linux/u/c/2_mmzone.h 1.10 644)
@@ -29,7 +29,6 @@
unsigned long offset;
unsigned long free_pages;
char low_on_memory;
- char zone_wake_kswapd;
unsigned long pages_min, pages_low, pages_high;
/*