Backport of upstream commit db7d62c8d5:
Avoid attempting to mmap memory from an offset that is not a multiple of
the system page size on systems with page sizes larger than 32KB.
https://www.sqlite.org/src/info/db7d62c8d58eb1e8654a762c9b199ae4e2759038
Index: src/os_unix.c
==================================================================
--- src/os_unix.c
+++ src/os_unix.c
@@ -321,10 +321,11 @@
return geteuid() ? 0 : fchown(fd,uid,gid);
}
/* Forward reference */
static int openDirectory(const char*, int*);
+static int unixGetpagesize(void);
/*
** Many system calls are accessed through pointer-to-functions so that
** they may be overridden at runtime to facilitate fault injection during
** testing and sandboxing. The following array holds the names and pointers
@@ -443,10 +444,13 @@
{ "mremap", (sqlite3_syscall_ptr)mremap, 0 },
#else
{ "mremap", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osMremap ((void*(*)(void*,size_t,size_t,int,...))aSyscall[23].pCurrent)
+
+ { "getpagesize", (sqlite3_syscall_ptr)unixGetpagesize, 0 },
+#define osGetpagesize ((int(*)(void))aSyscall[24].pCurrent)
}; /* End of the overrideable system calls */
/*
** This is the xSetSystemCall() method of sqlite3_vfs for all of the
@@ -4103,10 +4107,40 @@
#endif
return rc;
}
+/*
+** Return the system page size, as reported by getpagesize() when
+** _BSD_SOURCE is defined and by sysconf(_SC_PAGESIZE) otherwise.
+** Other code in this file must not call this function directly. It is the
+** "getpagesize" entry of aSyscall[] and is reached via osGetpagesize().
+*/
+static int unixGetpagesize(void){
+#if defined(_BSD_SOURCE)
+ return getpagesize();
+#else
+ return (int)sysconf(_SC_PAGESIZE);
+#endif
+}
+
+/*
+** Return the minimum number of 32KB shm regions that should be mapped at
+** a time, assuming that each mapping must be an integer multiple of the
+** current system page-size.
+**
+** The result is 1 if the page size is 32KB or less, and the page size
+** divided by 32KB otherwise. For example, with 64KB pages each mapping
+** must cover two shm regions.
+*/
+static int unixShmRegionPerMap(void){
+ int shmsz = 32*1024; /* SHM region size */
+ int pgsz = osGetpagesize(); /* System page size */
+ assert( ((pgsz-1)&pgsz)==0 ); /* Page size must be a power of 2 */
+ if( pgsz<shmsz ) return 1; /* One region is already a whole number of pages */
+ return pgsz/shmsz; /* Number of regions needed to fill one page */
+}
/*
** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
**
** This is not a VFS shared-memory method; it is a utility function called
@@ -4114,14 +4148,15 @@
*/
static void unixShmPurge(unixFile *pFd){
unixShmNode *p = pFd->pInode->pShmNode;
assert( unixMutexHeld() );
if( p && p->nRef==0 ){
+ int nShmPerMap = unixShmRegionPerMap();
int i;
assert( p->pInode==pFd->pInode );
sqlite3_mutex_free(p->mutex);
- for(i=0; i<p->nRegion; i++){
+ for(i=0; i<p->nRegion; i+=nShmPerMap){
if( p->h>=0 ){
osMunmap(p->apRegion[i], p->szRegion);
}else{
sqlite3_free(p->apRegion[i]);
}
@@ -4324,10 +4359,12 @@
){
unixFile *pDbFd = (unixFile*)fd;
unixShm *p;
unixShmNode *pShmNode;
int rc = SQLITE_OK;
+ int nShmPerMap = unixShmRegionPerMap();
+ int nReqRegion;
/* If the shared-memory file has not yet been opened, open it now. */
if( pDbFd->pShm==0 ){
rc = unixOpenSharedMemory(pDbFd);
if( rc!=SQLITE_OK ) return rc;
@@ -4339,13 +4376,16 @@
assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
assert( pShmNode->pInode==pDbFd->pInode );
assert( pShmNode->h>=0 || pDbFd->pInode->bProcessLock==1 );
assert( pShmNode->h<0 || pDbFd->pInode->bProcessLock==0 );
- if( pShmNode->nRegion<=iRegion ){
+ /* Minimum number of regions required to be mapped. */
+ nReqRegion = ((iRegion+nShmPerMap) / nShmPerMap) * nShmPerMap;
+
+ if( pShmNode->nRegion<nReqRegion ){
char **apNew; /* New apRegion[] array */
- int nByte = (iRegion+1)*szRegion; /* Minimum required file size */
+ int nByte = nReqRegion*szRegion; /* Minimum required file size */
struct stat sStat; /* Used by fstat() */
pShmNode->szRegion = szRegion;
if( pShmNode->h>=0 ){
@@ -4390,21 +4430,23 @@
}
}
/* Map the requested memory region into this processes address space. */
apNew = (char **)sqlite3_realloc(
- pShmNode->apRegion, (iRegion+1)*sizeof(char *)
+ pShmNode->apRegion, nReqRegion*sizeof(char *)
);
if( !apNew ){
rc = SQLITE_IOERR_NOMEM;
goto shmpage_out;
}
pShmNode->apRegion = apNew;
- while(pShmNode->nRegion<=iRegion){
+ while( pShmNode->nRegion<nReqRegion ){
+ int nMap = szRegion*nShmPerMap;
+ int i;
void *pMem;
if( pShmNode->h>=0 ){
- pMem = osMmap(0, szRegion,
+ pMem = osMmap(0, nMap,
pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE,
MAP_SHARED, pShmNode->h, szRegion*(i64)pShmNode->nRegion
);
if( pMem==MAP_FAILED ){
rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename);
@@ -4416,12 +4458,15 @@
rc = SQLITE_NOMEM;
goto shmpage_out;
}
memset(pMem, 0, szRegion);
}
- pShmNode->apRegion[pShmNode->nRegion] = pMem;
- pShmNode->nRegion++;
+
+ for(i=0; i<nShmPerMap; i++){
+ pShmNode->apRegion[pShmNode->nRegion+i] = &((char*)pMem)[szRegion*i];
+ }
+ pShmNode->nRegion += nShmPerMap;
}
}
shmpage_out:
if( pShmNode->nRegion>iRegion ){
@@ -4631,25 +4676,10 @@
#endif
}
#if SQLITE_MAX_MMAP_SIZE>0
/*
-** Return the system page size.
-*/
-static int unixGetPagesize(void){
-#if HAVE_MREMAP
- return 512;
-#elif defined(_BSD_SOURCE)
- return getpagesize();
-#else
- return (int)sysconf(_SC_PAGESIZE);
-#endif
-}
-#endif /* SQLITE_MAX_MMAP_SIZE>0 */
-
-#if SQLITE_MAX_MMAP_SIZE>0
-/*
** Attempt to set the size of the memory mapping maintained by file
** descriptor pFd to nNew bytes. Any existing mapping is discarded.
**
** If successful, this function sets the following variables:
**
@@ -4680,12 +4712,16 @@
assert( MAP_FAILED!=0 );
if( (pFd->ctrlFlags & UNIXFILE_RDONLY)==0 ) flags |= PROT_WRITE;
if( pOrig ){
- const int szSyspage = unixGetPagesize();
+#if HAVE_MREMAP
+ i64 nReuse = pFd->mmapSize;
+#else
+ const int szSyspage = osGetpagesize();
i64 nReuse = (pFd->mmapSize & ~(szSyspage-1));
+#endif
u8 *pReq = &pOrig[nReuse];
/* Unmap any pages of the existing mapping that cannot be reused. */
if( nReuse!=nOrig ){
osMunmap(pReq, nOrig-nReuse);
@@ -7427,11 +7463,11 @@
};
unsigned int i; /* Loop counter */
/* Double-check that the aSyscall[] array has been constructed
** correctly. See ticket [bb3a86e890c8e96ab] */
- assert( ArraySize(aSyscall)==24 );
+ assert( ArraySize(aSyscall)==25 );
/* Register all VFSes defined in the aVfs[] array */
for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){
sqlite3_vfs_register(&aVfs[i], i==0);
}
Index: src/test_syscall.c
==================================================================
--- src/test_syscall.c
+++ src/test_syscall.c
@@ -65,10 +65,15 @@
** Return true if the named system call exists. Or false otherwise.
**
** test_syscall list
** Return a list of all system calls. The list is constructed using
** the xNextSystemCall() VFS method.
+**
+** test_syscall pagesize PGSZ
+** If PGSZ is a power of two greater than 256, install a wrapper around
+** OS function getpagesize() that reports the system page size as PGSZ.
+** If PGSZ is negative, remove the wrapper. Other values are an error.
*/
#include "sqliteInt.h"
#include "sqlite3.h"
#include "tcl.h"
@@ -87,11 +92,13 @@
static struct TestSyscallGlobal {
int bPersist; /* 1 for persistent errors, 0 for transient */
int nCount; /* Fail after this many more calls */
int nFail; /* Number of failures that have occurred */
-} gSyscall = { 0, 0 };
+ int pgsz; /* Page size returned by ts_getpagesize() */
+ sqlite3_syscall_ptr orig_getpagesize; /* Saved "getpagesize" syscall */
+} gSyscall = { 0, 0, 0, 0, 0 };
static int ts_open(const char *, int, int);
static int ts_close(int fd);
static int ts_access(const char *zPath, int mode);
static char *ts_getcwd(char *zPath, size_t nPath);
@@ -647,10 +654,49 @@
pVfs = sqlite3_vfs_find(0);
Tcl_SetObjResult(interp, Tcl_NewStringObj(pVfs->zName, -1));
return TCL_OK;
}
+
+static int ts_getpagesize(void){
+ return gSyscall.pgsz; /* Page size stored by test_syscall_pagesize() */
+}
+
+/* Tcl command "test_syscall pagesize PGSZ": install (PGSZ a power of two
+** >=512) or remove (PGSZ<0) the ts_getpagesize() override of getpagesize. */
+static int test_syscall_pagesize(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  sqlite3_vfs *pVfs = sqlite3_vfs_find(0);
+  int pgsz;                       /* Integer value of the PGSZ argument */
+  if( objc!=3 ){
+    Tcl_WrongNumArgs(interp, 2, objv, "PGSZ");
+    return TCL_ERROR;
+  }
+  if( Tcl_GetIntFromObj(interp, objv[2], &pgsz) ) return TCL_ERROR;
+  if( pgsz<0 ){
+    if( gSyscall.orig_getpagesize ){
+      pVfs->xSetSystemCall(pVfs, "getpagesize", gSyscall.orig_getpagesize);
+    }
+  }else{
+    if( pgsz<512 || (pgsz & (pgsz-1)) ){
+      Tcl_AppendResult(interp, "pgsz out of range", 0);
+      return TCL_ERROR;
+    }
+    /* Save the real function once; later installs would save the wrapper */
+    if( gSyscall.orig_getpagesize==0 ){
+      gSyscall.orig_getpagesize = pVfs->xGetSystemCall(pVfs, "getpagesize");
+    }
+    gSyscall.pgsz = pgsz;
+    pVfs->xSetSystemCall(
+      pVfs, "getpagesize", (sqlite3_syscall_ptr)ts_getpagesize);
+  }
+  return TCL_OK;
+}
static int test_syscall(
void * clientData,
Tcl_Interp *interp,
int objc,
@@ -666,10 +712,11 @@
{ "reset", test_syscall_reset },
{ "errno", test_syscall_errno },
{ "exists", test_syscall_exists },
{ "list", test_syscall_list },
{ "defaultvfs", test_syscall_defaultvfs },
+ { "pagesize", test_syscall_pagesize },
{ 0, 0 }
};
int iCmd;
int rc;
Index: test/syscall.test
==================================================================
--- test/syscall.test
+++ test/syscall.test
@@ -59,10 +59,11 @@
foreach s {
open close access getcwd stat fstat ftruncate
fcntl read pread write pwrite fchmod fallocate
pread64 pwrite64 unlink openDirectory mkdir rmdir
statvfs fchown umask mmap munmap mremap
+ getpagesize
} {
if {[test_syscall exists $s]} {lappend syscall_list $s}
}
do_test 3.1 { lsort [test_syscall list] } [lsort $syscall_list]
ADDED test/wal64k.test
Index: test/wal64k.test
==================================================================
--- test/wal64k.test
+++ test/wal64k.test
@@ -0,0 +1,47 @@
+# 2010 April 13
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL" mode with a simulated 64KB system page size.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix wal64k
+
+ifcapable !wal {finish_test ; return }
+
+db close
+test_syscall pagesize 65536
+sqlite3 db test.db
+
+do_execsql_test 1.0 {
+ PRAGMA journal_mode = WAL;
+ CREATE TABLE t1(x);
+ CREATE INDEX i1 ON t1(x);
+} {wal}
+do_test 1.1 { file size test.db-shm } {65536}
+
+do_test 1.2 {
+ execsql BEGIN
+ while {[file size test.db-shm]==65536} {
+ execsql { INSERT INTO t1 VALUES( randstr(900,1100) ) }
+ }
+ execsql COMMIT
+ file size test.db-shm
+} {131072}
+
+integrity_check 1.3
+
+db close
+test_syscall pagesize -1
+finish_test
+