Hello,

(this time it's for real ;-)


DESCRIPTION
===========

This is my first working attempt at adding file mapping support to uClinux.
It received some light testing and *appears* to work fine. The main
motivation behind this patch is saving memory when loading binaries
from read/write filesystems such as JFFS2 that don't support in-place
execution. Combining this patch with FLAT shared libraries support
reduces memory usage on a typical uClinux system.


IMPLEMENTATION
==============

The address_space structure keeps track of memory mappings for inodes.
In uClinux we can't rely on paging, therefore we need to allocate a
contiguous memory block and read the file into it at once. This is done
by generic_file_mmap().

The difficult part is getting rid of the allocated block when all
processes have finished using it. I added a usage counter in the
address_space structure and added a link in mm_rblock_struct to the
mapped file.

The code for handling rblock and tblock structures got more complex
and is now needed in multiple places. Therefore I decided to fold it
into separate functions taking care of all details: create_tblock()
and delete_tblock(). As a bonus, do_mmap_pgoff() got much shorter
and cleaner.

There's an ugly asymmetry here: those mmapped blocks are allocated in
mm/filemap.c:generic_file_mmap() and freed in mm/nommu.c:delete_tblock().
It's unfortunate there's no generic_file_munmap().


KNOWN PROBLEMS
==============

This patch is quite invasive and unreadable. I'm releasing it
mainly to get some early feedback.

I'm not quite sure about locking issues, and I have a strong feeling
I should have put back VMA stuff in nommu.c instead of messing with
tblocks to enable file map tracking.

I'm also violating some of the mmap() semantics, but of course
you can't have a full mmap() implementation without an MMU.
Read-only mapping is not guaranteed to be read-only, and writes won't
be flushed to disk. Also, we assume all maps are of the same size and
at the same file offset. What we have is just good enough for sharing
mapped binaries among processes.




diff -Nur linux-2.5.70-uc0/include/asm-m68knommu/mmu.h linux-2.5.x/include/asm-m68knommu/mmu.h
--- linux-2.5.70-uc0/include/asm-m68knommu/mmu.h	2003-05-27 03:00:44.000000000 +0200
+++ linux-2.5.x/include/asm-m68knommu/mmu.h	2003-06-01 02:19:39.000000000 +0200
@@ -4,9 +4,10 @@
 /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */
 
 struct mm_rblock_struct {
-	int	size;
-	int	refcount;
-	void	*kblock;
+	int		size;
+	int		refcount;
+	struct file	*file;
+	void		*kblock;
 };
 
 struct mm_tblock_struct {
diff -Nur linux-2.5.70-uc0/include/linux/fs.h linux-2.5.x/include/linux/fs.h
--- linux-2.5.70-uc0/include/linux/fs.h	2003-05-27 03:00:26.000000000 +0200
+++ linux-2.5.x/include/linux/fs.h	2003-06-01 15:03:12.000000000 +0200
@@ -320,9 +320,14 @@
 	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
+#ifdef CONFIG_MMU
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+#else /* !CONFIG_MMU */
+	char *			i_mmap_block;	/* Pointer to memory buffer for mmapped file */
+	int			i_mmap_cnt;	/* Track use count of i_mmap_block (FIXME: shall we use atomic_t?) */
+#endif /* !CONFIG_MMU */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -Nur linux-2.5.70-uc0/mm/filemap.c linux-2.5.x/mm/filemap.c
--- linux-2.5.70-uc0/mm/filemap.c	2003-05-27 03:00:37.000000000 +0200
+++ linux-2.5.x/mm/filemap.c	2003-06-01 23:57:31.000000000 +0200
@@ -40,6 +40,9 @@
 #include <asm/uaccess.h>
 #include <asm/mman.h>
 
+/* Turn on verbose debug messages for mmap related stuff */
+#undef DEBUG_MMAP
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -582,12 +585,14 @@
 		if (!PageUptodate(page))
 			goto page_not_up_to_date;
 page_ok:
+#ifdef CONFIG_MMU
 		/* If users can be writing to this page using arbitrary
 		 * virtual addresses, take care about potential aliasing
 		 * before reading the page on the kernel side.
 		 */
 		if (!list_empty(&mapping->i_mmap_shared))
 			flush_dcache_page(page);
+#endif /* CONFIG_MMU */
 
 		/*
 		 * Mark the page accessed if we read the beginning.
@@ -1264,6 +1269,56 @@
 	return 0;
 }
 
+#else /* !CONFIG_MMU */
+/* !MMU mmap: read the file into one kmalloc'ed block kept in its address_space, counted by i_mmap_cnt. */
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	ssize_t error;	/* must be signed: a negative ->read() result is checked below */
+	mm_segment_t old_fs;
+	size_t len = vma->vm_end - vma->vm_start;
+
+	if (++mapping->i_mmap_cnt == 1)	/* first mapper allocates the block */
+	{
+#ifdef DEBUG_MMAP
+		printk("generic_file_mmap(): allocating %u bytes for inode #%ld\n",
+			len, mapping->host->i_ino);
+#endif
+		if (!(mapping->i_mmap_block = kmalloc(len, GFP_KERNEL)))
+		{
+			--mapping->i_mmap_cnt;
+			return -ENOMEM;
+		}
+	}
+#ifdef DEBUG_MMAP
+	else
+		printk("mmap: reusing allocated block for inode #%ld (i_mmap_cnt = %d)\n",
+			mapping->host->i_ino, mapping->i_mmap_cnt);
+#endif
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);	/* the destination is a kernel buffer, not user memory */
+	error = file->f_op->read(file, mapping->i_mmap_block, len, &file->f_pos);
+	set_fs(old_fs);
+
+	if (error < 0) {	/* read failed: undo our count, free the block if we were the last user */
+		if (--mapping->i_mmap_cnt == 0)
+		{
+			kfree(mapping->i_mmap_block);
+			mapping->i_mmap_block = NULL;
+		}
+		return error;
+	}
+
+	/* Short read: zero the rest of the mapped block */
+	if ((size_t)error < len)
+		memset(mapping->i_mmap_block + error, 0, len - (size_t)error);
+
+	vma->vm_start = (unsigned long)mapping->i_mmap_block;	/* hand the block address back via the vma */
+	return 0;
+}
+#endif /* !CONFIG_MMU */
+
 /*
  * This is for filesystems which do not implement ->writepage.
  */
@@ -1273,16 +1328,6 @@
 		return -EINVAL;
 	return generic_file_mmap(file, vma);
 }
-#else
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	return -ENOSYS;
-}
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_MMU */
 
 static inline struct page *__read_cache_page(struct address_space *mapping,
 				unsigned long index,
diff -Nur linux-2.5.70-uc0/mm/nommu.c linux-2.5.x/mm/nommu.c
--- linux-2.5.70-uc0/mm/nommu.c	2003-05-29 19:00:01.000000000 +0200
+++ linux-2.5.x/mm/nommu.c	2003-06-01 23:20:27.000000000 +0200
@@ -17,7 +17,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/blkdev.h>
+#include <linux/file.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -26,6 +26,8 @@
 
 /* Define to enable debug output for mm */
 #undef DEBUG
+#undef DEBUG_TBLOCK
+#undef WARN_ON_SLACK
 
 void *high_memory;
 struct page *mem_map = NULL;
@@ -253,9 +255,9 @@
 #undef _trans
 }
 
-#ifdef DEBUG
 static void show_process_blocks(void)
 {
+#ifdef DEBUG_TBLOCK
 	struct mm_tblock_struct *tblock;
 
 	printk("Process blocks %d:", current->pid);
@@ -266,8 +268,120 @@
 			printk(" (%d @%p #%d)", kobjsize(tblock->rblock->kblock), tblock->rblock->kblock, tblock->rblock->refcount);
 		printk(tblock->next ? " ->" : ".\n");
 	}
+#endif /* DEBUG_TBLOCK */
+}
+/* Allocate a tblock+rblock for kblock (or for a new zeroed block of size bytes if kblock is NULL) and link it into mm; NULL on failure. */
+static struct mm_tblock_struct *create_tblock(struct mm_struct *mm, void *kblock, size_t size)
+{
+	struct mm_tblock_struct *tblock;
+
+	tblock = (struct mm_tblock_struct *)
+		kmalloc(sizeof(struct mm_tblock_struct), GFP_KERNEL);
+
+	if (!tblock) {
+		printk("Allocation of tblock for %u byte allocation from process %d failed\n",
+			size, current->pid);
+		show_free_areas();
+		return NULL;
+	}
+
+	tblock->rblock = (struct mm_rblock_struct *)
+		kmalloc(sizeof(struct mm_rblock_struct), GFP_KERNEL);
+
+	if (!tblock->rblock) {
+		printk("Allocation of rblock for %u byte allocation from process %d failed\n",
+			size, current->pid);
+		show_free_areas();
+		kfree(tblock);
+		return NULL;
+	}
+
+	if (!kblock)	/* no caller-supplied block: allocate and account for our own */
+	{
+		if (!(kblock = kmalloc(size, GFP_KERNEL))) {
+			printk("Allocation of %u bytes from process %d failed\n",
+				size, current->pid);
+			show_free_areas();
+			kfree(tblock->rblock);
+			kfree(tblock);
+			return NULL;
+		}
+
+		memset(kblock, '\0', size);
+
+		realalloc += kobjsize(kblock);
+		askedalloc += size;
+	}
+
+	/* Init rblock; callers that map a file set ->file afterwards */
+	tblock->rblock->refcount = 1;
+	tblock->rblock->file = NULL;
+	tblock->rblock->kblock = kblock;
+	tblock->rblock->size = size;
+
+#ifdef WARN_ON_SLACK
+	if ((size + WARN_ON_SLACK) <= kobjsize(kblock))
+		printk("Allocation of %u bytes from process %d has %lu bytes of slack\n",
+			size, current->pid, kobjsize(kblock) - size);
+#endif
+
+	realalloc += kobjsize(tblock);
+	askedalloc += sizeof(struct mm_tblock_struct);
+
+	realalloc += kobjsize(tblock->rblock);
+	askedalloc += sizeof(struct mm_rblock_struct);
+
+	/* Link tblock at the head of the list of the mm passed in */
+	tblock->next = mm->context.tblock.next;
+	mm->context.tblock.next = tblock;
+
+	return tblock;
+}
+/* Drop tblock's reference on its rblock (releasing the block on the last one), then unlink tblock from after prev and free it. */
+void delete_tblock(struct mm_tblock_struct *tblock, struct mm_tblock_struct *prev)
+{
+	if (tblock->rblock) {
+		if (!--tblock->rblock->refcount) {	/* last reference to this rblock */
+			struct file *file;
+
+			if ((file = tblock->rblock->file)) {	/* file-backed: the buffer lives in the mapping */
+				struct address_space *mapping;
+
+				mapping = file->f_dentry->d_inode->i_mapping;
+
+#ifdef DEBUG
+				printk("delete_tblock(): releasing file with f_count %d, i_mmap_cnt %d, (inode #%ld)\n",
+					file->f_count.counter, mapping->i_mmap_cnt, mapping->host->i_ino);
+#endif
+				if (--mapping->i_mmap_cnt == 0) {	/* last mapper frees the shared buffer */
+#ifdef DEBUG
+					printk("delete_tblock(): freeing mmapped block @%p (inode #%ld).\n",
+						mapping->i_mmap_block, mapping->host->i_ino);
+#endif
+					kfree(mapping->i_mmap_block);
+					mapping->i_mmap_block = NULL;
+				}
+
+				fput(file);	/* pairs with get_file() in do_mmap_pgoff() */
+			}
+			else if (tblock->rblock->kblock) {	/* anonymous block: undo accounting, then free */
+				realalloc -= kobjsize(tblock->rblock->kblock);
+				askedalloc -= tblock->rblock->size;
+				kfree(tblock->rblock->kblock);
+			}
+
+			realalloc -= kobjsize(tblock->rblock);
+			askedalloc -= sizeof(struct mm_rblock_struct);
+			kfree(tblock->rblock);
+		}
+	}
+
+	/* Unlink tblock from mm list; prev must be the entry whose ->next is tblock */
+	prev->next = tblock->next;
+	realalloc -= kobjsize(tblock);
+	askedalloc -= sizeof(struct mm_tblock_struct);
+	kfree(tblock);
+}
-#endif /* DEBUG */
 
 unsigned long do_mmap_pgoff(
 	struct file * file,
@@ -277,7 +391,6 @@
 	unsigned long flags,
 	unsigned long pgoff)
 {
-	void * result;
 	struct mm_tblock_struct * tblock;
 	unsigned int vm_flags;
 
@@ -351,95 +464,42 @@
 		   or do something truly complicated. */
 		   
 		if (file->f_op->mmap) {
+			get_file(file);
 			error = file->f_op->mmap(file, &vma);
-				   
 #ifdef DEBUG
 			printk("f_op->mmap() returned %d/%lx\n", error, vma.vm_start);
 #endif
-			if (!error)
-				return vma.vm_start;
-			else if (error != -ENOSYS)
+			if (error)
+			{
+				fput(file);
 				return error;
-		} else
-			return -ENODEV; /* No mapping operations defined */
-
-		/* An ENOSYS error indicates that mmap isn't possible (as opposed to
-		   tried but failed) so we'll fall through to the copy. */
-	}
+			}
 
-	tblock = (struct mm_tblock_struct *)
-                        kmalloc(sizeof(struct mm_tblock_struct), GFP_KERNEL);
-	if (!tblock) {
-		printk("Allocation of tblock for %lu byte allocation from process %d failed\n", len, current->pid);
-		show_free_areas();
-		return -ENOMEM;
-	}
+			if (!(tblock = create_tblock(current->mm, (void *)vma.vm_start, vma.vm_end - vma.vm_start)))
+			{
+				fput(file);
+				return -ENOMEM;
+			}
 
-	tblock->rblock = (struct mm_rblock_struct *)
-			kmalloc(sizeof(struct mm_rblock_struct), GFP_KERNEL);
+			tblock->rblock->file = file;
 
-	if (!tblock->rblock) {
-		printk("Allocation of rblock for %lu byte allocation from process %d failed\n", len, current->pid);
-		show_free_areas();
-		kfree(tblock);
-		return -ENOMEM;
-	}
-
-	result = kmalloc(len, GFP_KERNEL);
-	if (!result) {
-		printk("Allocation of length %lu from process %d failed\n", len,
-				current->pid);
-		show_free_areas();
-		kfree(tblock->rblock);
-		kfree(tblock);
-		return -ENOMEM;
+		} else
+			return -ENODEV; /* No mapping operations defined */
 	}
-
-	tblock->rblock->refcount = 1;
-	tblock->rblock->kblock = result;
-	tblock->rblock->size = len;
-	
-	realalloc += kobjsize(result);
-	askedalloc += len;
-
-#ifdef WARN_ON_SLACK	
-	if ((len+WARN_ON_SLACK) <= kobjsize(result))
-		printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", len, current->pid, kobjsize(result)-len);
-#endif
-	
-	if (file) {
-		int error;
-		mm_segment_t old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		error = file->f_op->read(file, (char *) result, len, &file->f_pos);
-		set_fs(old_fs);
-		if (error < 0) {
-			kfree(result);
-			kfree(tblock->rblock);
-			kfree(tblock);
-			return error;
-		}
-		if (error < len)
-			memset(result+error, '\0', len-error);
-	} else {
-		memset(result, '\0', len);
+	else
+	{
+		/* Handle anonymous mapping */
+		if (!(tblock = create_tblock(current->mm, NULL, len)))
+			return -ENOMEM;
 	}
 
-	realalloc += kobjsize(tblock);
-	askedalloc += sizeof(struct mm_tblock_struct);
-
-	realalloc += kobjsize(tblock->rblock);
-	askedalloc += sizeof(struct mm_rblock_struct);
-
-	tblock->next = current->mm->context.tblock.next;
-	current->mm->context.tblock.next = tblock;
-
 #ifdef DEBUG
 	printk("do_mmap:\n");
 	show_process_blocks();
-#endif	  
+#endif
 
-	return (unsigned long)result;
+	/* Return addr of mmapped memory block */
+	return (unsigned long) tblock->rblock->kblock;
 }
 
 int do_munmap(struct mm_struct * mm, unsigned long addr, size_t len)
@@ -469,27 +529,9 @@
 				current->pid, current->comm, (void*)addr);
 		return -EINVAL;
 	}
-	if (tblock->rblock) {
-		if (!--tblock->rblock->refcount) {
-			if (tblock->rblock->kblock) {
-				realalloc -= kobjsize(tblock->rblock->kblock);
-				askedalloc -= tblock->rblock->size;
-				kfree(tblock->rblock->kblock);
-			}
-			
-			realalloc -= kobjsize(tblock->rblock);
-			askedalloc -= sizeof(struct mm_rblock_struct);
-			kfree(tblock->rblock);
-		}
-	}
-	tmp->next = tblock->next;
-	realalloc -= kobjsize(tblock);
-	askedalloc -= sizeof(struct mm_tblock_struct);
-	kfree(tblock);
 
-#ifdef DEBUG
+	delete_tblock(tblock, tmp);
 	show_process_blocks();
-#endif	  
 
 	return -EINVAL;
 }
@@ -507,28 +549,8 @@
 #endif
 
 	while((tmp = mm->context.tblock.next)) {
-		if (tmp->rblock) {
-			if (!--tmp->rblock->refcount) {
-				if (tmp->rblock->kblock) {
-					realalloc -= kobjsize(tmp->rblock->kblock);
-					askedalloc -= tmp->rblock->size;
-					kfree(tmp->rblock->kblock);
-				}
-				realalloc -= kobjsize(tmp->rblock);
-				askedalloc -= sizeof(struct mm_rblock_struct);
-				kfree(tmp->rblock);
-			}
-			tmp->rblock = 0;
-		}
-		mm->context.tblock.next = tmp->next;
-		realalloc -= kobjsize(tmp);
-		askedalloc -= sizeof(struct mm_tblock_struct);
-		kfree(tmp);
+		delete_tblock(mm->context.tblock.next, &mm->context.tblock);
 	}
-
-#ifdef DEBUG
-	show_process_blocks();
-#endif	  
 }
 
 asmlinkage long sys_munmap(unsigned long addr, size_t len)
diff -Nur linux-2.5.70-uc0/mm/vmscan.c linux-2.5.x/mm/vmscan.c
--- linux-2.5.70-uc0/mm/vmscan.c	2003-05-27 03:00:24.000000000 +0200
+++ linux-2.5.x/mm/vmscan.c	2003-05-31 21:27:32.000000000 +0200
@@ -189,11 +189,13 @@
 	if (PageSwapCache(page))
 		return 1;
 
+#ifdef CONFIG_MMU
 	/* File is mmap'd by somebody. */
 	if (!list_empty(&mapping->i_mmap))
 		return 1;
 	if (!list_empty(&mapping->i_mmap_shared))
 		return 1;
+#endif /* CONFIG_MMU */
 
 	return 0;
 }
diff -Nur linux-2.5.70-uc0/fs/inode.c linux-2.5.x/fs/inode.c
--- linux-2.5.70-uc0/fs/inode.c	2003-05-27 03:01:00.000000000 +0200
+++ linux-2.5.x/fs/inode.c	2003-06-01 19:24:40.000000000 +0200
@@ -146,6 +146,12 @@
 		mapping->dirtied_when = 0;
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+
+#ifndef CONFIG_MMU
+		mapping->i_mmap_block = NULL;
+		mapping->i_mmap_cnt = 0;
+#endif /* !CONFIG_MMU */
+
 		if (sb->s_bdev)
 			mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		memset(&inode->u, 0, sizeof(inode->u));
@@ -184,11 +190,13 @@
 	sema_init(&inode->i_sem, 1);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
-	init_MUTEX(&inode->i_data.i_shared_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
+#ifdef CONFIG_MMU
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+	init_MUTEX(&inode->i_data.i_shared_sem);
+#endif /* CONFIG_MMU */
 	spin_lock_init(&inode->i_lock);
 }
 
diff -Nur linux-2.5.70-uc0/fs/locks.c linux-2.5.x/fs/locks.c
--- linux-2.5.70-uc0/fs/locks.c	2003-05-27 03:00:58.000000000 +0200
+++ linux-2.5.x/fs/locks.c	2003-05-31 21:30:47.000000000 +0200
@@ -1423,6 +1423,7 @@
 
 	inode = filp->f_dentry->d_inode;
 
+#ifdef CONFIG_MMU
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
@@ -1435,6 +1436,7 @@
 			goto out;
 		}
 	}
+#endif /* CONFIG_MMU */
 
 	error = flock_to_posix_lock(filp, file_lock, &flock);
 	if (error)
@@ -1561,6 +1563,7 @@
 
 	inode = filp->f_dentry->d_inode;
 
+#ifdef CONFIG_MMU
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
@@ -1573,6 +1576,7 @@
 			goto out;
 		}
 	}
+#endif /* CONFIG_MMU */
 
 	error = flock64_to_posix_lock(filp, file_lock, &flock);
 	if (error)
diff -Nur linux-2.5.70-uc0/include/linux/flat.h linux-2.5.x/include/linux/flat.h
--- linux-2.5.70-uc0/include/linux/flat.h	2003-05-29 19:00:01.000000000 +0200
+++ linux-2.5.x/include/linux/flat.h	2003-06-01 18:34:35.000000000 +0200
@@ -10,6 +10,9 @@
 #ifndef _LINUX_FLAT_H
 #define _LINUX_FLAT_H
 
+/* FIXME: missing in Kconfig */
+#define CONFIG_BINFMT_SHARED_FLAT
+
 #include <asm/flat.h>
 
 #define	FLAT_VERSION			0x00000004L
