c0e712
Backport of upstream commit db7d62c8d5:
c0e712
c0e712
Avoid attempting to mmap memory from an offset that is not a multiple of
c0e712
the system page size on systems with page sizes larger than 32KB.
c0e712
c0e712
https://www.sqlite.org/src/info/db7d62c8d58eb1e8654a762c9b199ae4e2759038
c0e712
c0e712
Index: src/os_unix.c
c0e712
==================================================================
c0e712
--- src/os_unix.c
c0e712
+++ src/os_unix.c
c0e712
@@ -321,10 +321,11 @@
c0e712
   return geteuid() ? 0 : fchown(fd,uid,gid);
c0e712
 }
c0e712
 
c0e712
 /* Forward reference */
c0e712
 static int openDirectory(const char*, int*);
c0e712
+static int unixGetpagesize(void);
c0e712
 
c0e712
 /*
c0e712
 ** Many system calls are accessed through pointer-to-functions so that
c0e712
 ** they may be overridden at runtime to facilitate fault injection during
c0e712
 ** testing and sandboxing.  The following array holds the names and pointers
c0e712
@@ -443,10 +444,13 @@
c0e712
   { "mremap",       (sqlite3_syscall_ptr)mremap,          0 },
c0e712
 #else
c0e712
   { "mremap",       (sqlite3_syscall_ptr)0,               0 },
c0e712
 #endif
c0e712
 #define osMremap ((void*(*)(void*,size_t,size_t,int,...))aSyscall[23].pCurrent)
c0e712
+
c0e712
+  { "getpagesize",  (sqlite3_syscall_ptr)unixGetpagesize, 0 },
c0e712
+#define osGetpagesize ((int(*)(void))aSyscall[24].pCurrent)
c0e712
 
c0e712
 }; /* End of the overrideable system calls */
c0e712
 
c0e712
 /*
c0e712
 ** This is the xSetSystemCall() method of sqlite3_vfs for all of the
c0e712
@@ -4103,10 +4107,40 @@
c0e712
 #endif
c0e712
 
c0e712
   return rc;        
c0e712
 }
c0e712
 
c0e712
+/*
c0e712
+** Return the system page size.
c0e712
+**
c0e712
+** This function should not be called directly by other code in this file. 
c0e712
+** Instead, it should be called via macro osGetpagesize().
c0e712
+*/
c0e712
+static int unixGetpagesize(void){
c0e712
+#if defined(_BSD_SOURCE)
c0e712
+  return getpagesize();
c0e712
+#else
c0e712
+  return (int)sysconf(_SC_PAGESIZE);
c0e712
+#endif
c0e712
+}
c0e712
+
c0e712
+/*
c0e712
+** Return the minimum number of 32KB shm regions that should be mapped at
c0e712
+** a time, assuming that each mapping must be an integer multiple of the
c0e712
+** current system page-size.
c0e712
+**
c0e712
+** Usually, this is 1. The exception seems to be systems that are configured
c0e712
+** to use 64KB pages - in this case each mapping must cover at least two
c0e712
+** shm regions.
c0e712
+*/
c0e712
+static int unixShmRegionPerMap(void){
c0e712
+  int shmsz = 32*1024;            /* SHM region size */
c0e712
+  int pgsz = osGetpagesize();   /* System page size */
c0e712
+  assert( ((pgsz-1)&pgsz)==0 );   /* Page size must be a power of 2 */
c0e712
+  if( pgsz<shmsz ) return 1;
c0e712
+  return pgsz/shmsz;
c0e712
+}
c0e712
 
c0e712
 /*
c0e712
 ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
c0e712
 **
c0e712
 ** This is not a VFS shared-memory method; it is a utility function called
c0e712
@@ -4114,14 +4148,15 @@
c0e712
 */
c0e712
 static void unixShmPurge(unixFile *pFd){
c0e712
   unixShmNode *p = pFd->pInode->pShmNode;
c0e712
   assert( unixMutexHeld() );
c0e712
   if( p && p->nRef==0 ){
c0e712
+    int nShmPerMap = unixShmRegionPerMap();
c0e712
     int i;
c0e712
     assert( p->pInode==pFd->pInode );
c0e712
     sqlite3_mutex_free(p->mutex);
c0e712
-    for(i=0; i<p->nRegion; i++){
c0e712
+    for(i=0; i<p->nRegion; i+=nShmPerMap){
c0e712
       if( p->h>=0 ){
c0e712
         osMunmap(p->apRegion[i], p->szRegion);
c0e712
       }else{
c0e712
         sqlite3_free(p->apRegion[i]);
c0e712
       }
c0e712
@@ -4324,10 +4359,12 @@
c0e712
 ){
c0e712
   unixFile *pDbFd = (unixFile*)fd;
c0e712
   unixShm *p;
c0e712
   unixShmNode *pShmNode;
c0e712
   int rc = SQLITE_OK;
c0e712
+  int nShmPerMap = unixShmRegionPerMap();
c0e712
+  int nReqRegion;
c0e712
 
c0e712
   /* If the shared-memory file has not yet been opened, open it now. */
c0e712
   if( pDbFd->pShm==0 ){
c0e712
     rc = unixOpenSharedMemory(pDbFd);
c0e712
     if( rc!=SQLITE_OK ) return rc;
c0e712
@@ -4339,13 +4376,16 @@
c0e712
   assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
c0e712
   assert( pShmNode->pInode==pDbFd->pInode );
c0e712
   assert( pShmNode->h>=0 || pDbFd->pInode->bProcessLock==1 );
c0e712
   assert( pShmNode->h<0 || pDbFd->pInode->bProcessLock==0 );
c0e712
 
c0e712
-  if( pShmNode->nRegion<=iRegion ){
c0e712
+  /* Minimum number of regions required to be mapped. */
c0e712
+  nReqRegion = ((iRegion+nShmPerMap) / nShmPerMap) * nShmPerMap;
c0e712
+
c0e712
+  if( pShmNode->nRegion<nReqRegion ){
c0e712
     char **apNew;                      /* New apRegion[] array */
c0e712
-    int nByte = (iRegion+1)*szRegion;  /* Minimum required file size */
c0e712
+    int nByte = nReqRegion*szRegion;   /* Minimum required file size */
c0e712
     struct stat sStat;                 /* Used by fstat() */
c0e712
 
c0e712
     pShmNode->szRegion = szRegion;
c0e712
 
c0e712
     if( pShmNode->h>=0 ){
c0e712
@@ -4390,21 +4430,23 @@
c0e712
       }
c0e712
     }
c0e712
 
c0e712
     /* Map the requested memory region into this processes address space. */
c0e712
     apNew = (char **)sqlite3_realloc(
c0e712
-        pShmNode->apRegion, (iRegion+1)*sizeof(char *)
c0e712
+        pShmNode->apRegion, nReqRegion*sizeof(char *)
c0e712
     );
c0e712
     if( !apNew ){
c0e712
       rc = SQLITE_IOERR_NOMEM;
c0e712
       goto shmpage_out;
c0e712
     }
c0e712
     pShmNode->apRegion = apNew;
c0e712
-    while(pShmNode->nRegion<=iRegion){
c0e712
+    while( pShmNode->nRegion<nReqRegion ){
c0e712
+      int nMap = szRegion*nShmPerMap;
c0e712
+      int i;
c0e712
       void *pMem;
c0e712
       if( pShmNode->h>=0 ){
c0e712
-        pMem = osMmap(0, szRegion,
c0e712
+        pMem = osMmap(0, nMap,
c0e712
             pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE, 
c0e712
             MAP_SHARED, pShmNode->h, szRegion*(i64)pShmNode->nRegion
c0e712
         );
c0e712
         if( pMem==MAP_FAILED ){
c0e712
           rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename);
c0e712
@@ -4416,12 +4458,15 @@
c0e712
           rc = SQLITE_NOMEM;
c0e712
           goto shmpage_out;
c0e712
         }
c0e712
         memset(pMem, 0, szRegion);
c0e712
       }
c0e712
-      pShmNode->apRegion[pShmNode->nRegion] = pMem;
c0e712
-      pShmNode->nRegion++;
c0e712
+
c0e712
+      for(i=0; i<nShmPerMap; i++){
c0e712
+        pShmNode->apRegion[pShmNode->nRegion+i] = &((char*)pMem)[szRegion*i];
c0e712
+      }
c0e712
+      pShmNode->nRegion += nShmPerMap;
c0e712
     }
c0e712
   }
c0e712
 
c0e712
 shmpage_out:
c0e712
   if( pShmNode->nRegion>iRegion ){
c0e712
@@ -4631,25 +4676,10 @@
c0e712
 #endif
c0e712
 }
c0e712
 
c0e712
 #if SQLITE_MAX_MMAP_SIZE>0
c0e712
 /*
c0e712
-** Return the system page size.
c0e712
-*/
c0e712
-static int unixGetPagesize(void){
c0e712
-#if HAVE_MREMAP
c0e712
-  return 512;
c0e712
-#elif defined(_BSD_SOURCE)
c0e712
-  return getpagesize();
c0e712
-#else
c0e712
-  return (int)sysconf(_SC_PAGESIZE);
c0e712
-#endif
c0e712
-}
c0e712
-#endif /* SQLITE_MAX_MMAP_SIZE>0 */
c0e712
-
c0e712
-#if SQLITE_MAX_MMAP_SIZE>0
c0e712
-/*
c0e712
 ** Attempt to set the size of the memory mapping maintained by file 
c0e712
 ** descriptor pFd to nNew bytes. Any existing mapping is discarded.
c0e712
 **
c0e712
 ** If successful, this function sets the following variables:
c0e712
 **
c0e712
@@ -4680,12 +4712,16 @@
c0e712
   assert( MAP_FAILED!=0 );
c0e712
 
c0e712
   if( (pFd->ctrlFlags & UNIXFILE_RDONLY)==0 ) flags |= PROT_WRITE;
c0e712
 
c0e712
   if( pOrig ){
c0e712
-    const int szSyspage = unixGetPagesize();
c0e712
+#if HAVE_MREMAP
c0e712
+    i64 nReuse = pFd->mmapSize;
c0e712
+#else
c0e712
+    const int szSyspage = osGetpagesize();
c0e712
     i64 nReuse = (pFd->mmapSize & ~(szSyspage-1));
c0e712
+#endif
c0e712
     u8 *pReq = &pOrig[nReuse];
c0e712
 
c0e712
     /* Unmap any pages of the existing mapping that cannot be reused. */
c0e712
     if( nReuse!=nOrig ){
c0e712
       osMunmap(pReq, nOrig-nReuse);
c0e712
@@ -7427,11 +7463,11 @@
c0e712
   };
c0e712
   unsigned int i;          /* Loop counter */
c0e712
 
c0e712
   /* Double-check that the aSyscall[] array has been constructed
c0e712
   ** correctly.  See ticket [bb3a86e890c8e96ab] */
c0e712
-  assert( ArraySize(aSyscall)==24 );
c0e712
+  assert( ArraySize(aSyscall)==25 );
c0e712
 
c0e712
   /* Register all VFSes defined in the aVfs[] array */
c0e712
   for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){
c0e712
     sqlite3_vfs_register(&aVfs[i], i==0);
c0e712
   }
c0e712
c0e712
Index: src/test_syscall.c
c0e712
==================================================================
c0e712
--- src/test_syscall.c
c0e712
+++ src/test_syscall.c
c0e712
@@ -65,10 +65,15 @@
c0e712
 **     Return true if the named system call exists. Or false otherwise.
c0e712
 **
c0e712
 **   test_syscall list
c0e712
 **     Return a list of all system calls. The list is constructed using
c0e712
 **     the xNextSystemCall() VFS method.
c0e712
+**
c0e712
+**   test_syscall pagesize PGSZ
c0e712
+**     If PGSZ is a power of two greater than 256, install a wrapper around
c0e712
+**     OS function getpagesize() that reports the system page size as PGSZ.
c0e712
+**     Or, if PGSZ is less than zero, remove any wrapper already installed.
c0e712
 */
c0e712
 
c0e712
 #include "sqliteInt.h"
c0e712
 #include "sqlite3.h"
c0e712
 #include "tcl.h"
c0e712
@@ -87,11 +92,13 @@
c0e712
 
c0e712
 static struct TestSyscallGlobal {
c0e712
   int bPersist;                   /* 1 for persistent errors, 0 for transient */
c0e712
   int nCount;                     /* Fail after this many more calls */
c0e712
   int nFail;                      /* Number of failures that have occurred */
c0e712
-} gSyscall = { 0, 0 };
c0e712
+  int pgsz;
c0e712
+  sqlite3_syscall_ptr orig_getpagesize;
c0e712
+} gSyscall = { 0, 0, 0, 0, 0 };
c0e712
 
c0e712
 static int ts_open(const char *, int, int);
c0e712
 static int ts_close(int fd);
c0e712
 static int ts_access(const char *zPath, int mode);
c0e712
 static char *ts_getcwd(char *zPath, size_t nPath);
c0e712
@@ -647,10 +654,49 @@
c0e712
 
c0e712
   pVfs = sqlite3_vfs_find(0);
c0e712
   Tcl_SetObjResult(interp, Tcl_NewStringObj(pVfs->zName, -1));
c0e712
   return TCL_OK;
c0e712
 }
c0e712
+
c0e712
+static int ts_getpagesize(void){
c0e712
+  return gSyscall.pgsz;
c0e712
+}
c0e712
+
c0e712
+static int test_syscall_pagesize(
c0e712
+  void * clientData,
c0e712
+  Tcl_Interp *interp,
c0e712
+  int objc,
c0e712
+  Tcl_Obj *CONST objv[]
c0e712
+){
c0e712
+  sqlite3_vfs *pVfs = sqlite3_vfs_find(0);
c0e712
+  int pgsz;
c0e712
+  if( objc!=3 ){
c0e712
+    Tcl_WrongNumArgs(interp, 2, objv, "PGSZ");
c0e712
+    return TCL_ERROR;
c0e712
+  }
c0e712
+  if( Tcl_GetIntFromObj(interp, objv[2], &pgsz) ){
c0e712
+    return TCL_ERROR;
c0e712
+  }
c0e712
+
c0e712
+  if( pgsz<0 ){
c0e712
+    if( gSyscall.orig_getpagesize ){
c0e712
+      pVfs->xSetSystemCall(pVfs, "getpagesize", gSyscall.orig_getpagesize);
c0e712
+    }
c0e712
+  }else{
c0e712
+    if( pgsz<512 || (pgsz & (pgsz-1)) ){
c0e712
+      Tcl_AppendResult(interp, "pgsz out of range", 0);
c0e712
+      return TCL_ERROR;
c0e712
+    }
c0e712
+    gSyscall.orig_getpagesize = pVfs->xGetSystemCall(pVfs, "getpagesize");
c0e712
+    gSyscall.pgsz = pgsz;
c0e712
+    pVfs->xSetSystemCall(
c0e712
+        pVfs, "getpagesize", (sqlite3_syscall_ptr)ts_getpagesize
c0e712
+    );
c0e712
+  }
c0e712
+
c0e712
+  return TCL_OK;
c0e712
+}
c0e712
 
c0e712
 static int test_syscall(
c0e712
   void * clientData,
c0e712
   Tcl_Interp *interp,
c0e712
   int objc,
c0e712
@@ -666,10 +712,11 @@
c0e712
     { "reset",      test_syscall_reset },
c0e712
     { "errno",      test_syscall_errno },
c0e712
     { "exists",     test_syscall_exists },
c0e712
     { "list",       test_syscall_list },
c0e712
     { "defaultvfs", test_syscall_defaultvfs },
c0e712
+    { "pagesize",   test_syscall_pagesize },
c0e712
     { 0, 0 }
c0e712
   };
c0e712
   int iCmd;
c0e712
   int rc;
c0e712
 
c0e712
c0e712
Index: test/syscall.test
c0e712
==================================================================
c0e712
--- test/syscall.test
c0e712
+++ test/syscall.test
c0e712
@@ -59,10 +59,11 @@
c0e712
 foreach s {
c0e712
     open close access getcwd stat fstat ftruncate
c0e712
     fcntl read pread write pwrite fchmod fallocate
c0e712
     pread64 pwrite64 unlink openDirectory mkdir rmdir 
c0e712
     statvfs fchown umask mmap munmap mremap
c0e712
+    getpagesize
c0e712
 } {
c0e712
   if {[test_syscall exists $s]} {lappend syscall_list $s}
c0e712
 }
c0e712
 do_test 3.1 { lsort [test_syscall list] } [lsort $syscall_list]
c0e712
 
c0e712
c0e712
ADDED   test/wal64k.test
c0e712
Index: test/wal64k.test
c0e712
==================================================================
c0e712
--- test/wal64k.test
c0e712
+++ test/wal64k.test
c0e712
@@ -0,0 +1,47 @@
c0e712
+# 2010 April 13
c0e712
+#
c0e712
+# The author disclaims copyright to this source code.  In place of
c0e712
+# a legal notice, here is a blessing:
c0e712
+#
c0e712
+#    May you do good and not evil.
c0e712
+#    May you find forgiveness for yourself and forgive others.
c0e712
+#    May you share freely, never taking more than you give.
c0e712
+#
c0e712
+#***********************************************************************
c0e712
+# This file implements regression tests for SQLite library.  The
c0e712
+# focus of this file is testing the operation of the library in
c0e712
+# "PRAGMA journal_mode=WAL" mode.
c0e712
+#
c0e712
+
c0e712
+set testdir [file dirname $argv0]
c0e712
+source $testdir/tester.tcl
c0e712
+set testprefix wal64k
c0e712
+
c0e712
+ifcapable !wal {finish_test ; return }
c0e712
+
c0e712
+db close
c0e712
+test_syscall pagesize 65536
c0e712
+sqlite3 db test.db
c0e712
+
c0e712
+do_execsql_test 1.0 { 
c0e712
+  PRAGMA journal_mode = WAL;
c0e712
+  CREATE TABLE t1(x);
c0e712
+  CREATE INDEX i1 ON t1(x);
c0e712
+} {wal}
c0e712
+do_test 1.1 { file size test.db-shm } {65536}
c0e712
+
c0e712
+do_test 1.2 {
c0e712
+  execsql BEGIN
c0e712
+  while {[file size test.db-shm]==65536} {
c0e712
+    execsql { INSERT INTO t1 VALUES( randstr(900,1100) ) }
c0e712
+  }
c0e712
+  execsql COMMIT
c0e712
+  file size test.db-shm
c0e712
+} {131072}
c0e712
+
c0e712
+integrity_check 1.3
c0e712
+
c0e712
+db close
c0e712
+test_syscall pagesize -1
c0e712
+finish_test
c0e712
+
c0e712