typedef struct Config Config; typedef struct AMap AMap; typedef struct AMapN AMapN; typedef struct Arena Arena; typedef struct AState AState; typedef struct ArenaCIG ArenaCIG; typedef struct ArenaHead ArenaHead; typedef struct ArenaPart ArenaPart; typedef struct ArenaTail ArenaTail; typedef struct ATailStats ATailStats; typedef struct CIBlock CIBlock; typedef struct Clump Clump; typedef struct ClumpInfo ClumpInfo; typedef struct Graph Graph; typedef struct IAddr IAddr; typedef struct IBucket IBucket; typedef struct IEStream IEStream; typedef struct IEntry IEntry; typedef struct IFile IFile; typedef struct ISect ISect; typedef struct Index Index; typedef struct Lump Lump; typedef struct DBlock DBlock; typedef struct Part Part; typedef struct Statbin Statbin; typedef struct Statdesc Statdesc; typedef struct Stats Stats; typedef struct ZBlock ZBlock; typedef struct Round Round; typedef struct Bloom Bloom; #pragma incomplete IEStream #define TWID32 ((u32int)~(u32int)0) #define TWID64 ((u64int)~(u64int)0) #define TWID8 ((u8int)~(u8int)0) enum { ABlockLog = 9, /* log2(512), the quantum for reading arenas */ ANameSize = 64, MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */ MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */ PartBlank = 256*1024, /* untouched section at beginning of partition */ HeadSize = 512, /* size of a header after PartBlank */ MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */ IndexBase = 1024*1024, /* initial address to use in an index */ MaxIo = 64*1024, /* max size of a single read or write operation */ ICacheBits = 16, /* default bits for indexing icache */ MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */ /* * return codes from syncarena */ SyncDataErr = 1 << 0, /* problem reading the clump data */ SyncCIErr = 1 << 1, /* found erroneous clump directory entries */ SyncCIZero = 1 << 2, /* found unwritten clump directory entries */ SyncFixErr = 1 << 3, /* error writing fixed data */ SyncHeader = 1 << 4, /* altered header fields */ /* * error severity */ EOk = 0, /* error expected in normal operation */ EStrange, /* strange error that should be logged */ ECorrupt, /* corrupted data found in arenas */ EICorrupt, /* corrupted data found in index */ EAdmin, /* should be brought to administrators' attention */ ECrash, /* really bad internal error */ EBug, /* a limitation which should be fixed */ EInconsist, /* inconsistencies between index and arena */ EMax, /* * internal disk formats for the venti archival storage system */ /* * magic numbers on disk */ _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */ ClumpFreeMagic = 0, /* free clump; terminates active clump log */ ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */ ArenaMagic = 0xf2a14eadU, /* arena trailer */ ArenaHeadMagic = 0xd15c4eadU, /* arena header */ BloomMagic = 0xb1004eadU, /* bloom filter header */ BloomMaxHash = 32, ISectMagic = 0xd15c5ec7U, /* index header */ ArenaPartVersion = 3, ArenaVersion4 = 4, ArenaVersion5 = 5, BloomVersion = 1, IndexVersion = 1, ISectVersion1 = 1, ISectVersion2 = 2, /* * encodings of clumps on disk */ ClumpEErr = 0, /* can't happen */ ClumpENone, /* plain */ ClumpECompress, /* compressed */ ClumpEMax, /* * sizes in bytes on disk */ U8Size = 1, U16Size = 2, U32Size = 4, U64Size = 8, ArenaPartSize = 4 * U32Size, ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, ArenaSize5 = ArenaSize4 + U32Size, ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size, ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, ArenaHeadSize5 = ArenaHeadSize4 + U32Size, BloomHeadSize = 4 * U32Size, ISectSize1 = 7 * U32Size + 2 * ANameSize, ISectSize2 = ISectSize1 + U32Size, ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, MaxBloomSize = 1<<(32-3), /* 2^32 bits */ MaxBloomHash = 32, /* bits per score */ /* * BUG - The various block copies that manipulate entry buckets * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40, * so that everything is word-aligned. Buildindex is actually cpu-bound * by the (byte at a time) copying in qsort. */ IBucketSize = U32Size + U16Size, IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, IEntryAddrOff = VtScoreSize + U32Size + U16Size, MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, IcacheFrac = 1000000, /* denominator */ SleepForever = 1000000000, /* magic value for sleep time */ /* * dirty flags - order controls disk write order */ DirtyArena = 1, DirtyArenaCib, DirtyArenaTrailer, DirtyMax, ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry. VentiZZZZZZZZ }; extern char TraceDisk[]; extern char TraceLump[]; extern char TraceBlock[]; extern char TraceProc[]; extern char TraceWork[]; extern char TraceQuiet[]; extern char TraceRpc[]; /* * results of parsing and initializing a config file */ struct Config { char *index; /* name of the index to initialize */ int naparts; /* arena partitions initialized */ ArenaPart **aparts; int nsects; /* index sections initialized */ ISect **sects; Bloom *bloom; /* bloom filter */ u32int bcmem; u32int mem; u32int icmem; int queuewrites; char* haddr; char* vaddr; char* webroot; }; /* * a Part is the low level interface to files or disks. * there are two main types of partitions * arena paritions, which some number of arenas, each in a sub-partition. * index partition, which only have one subpartition. */ struct Part { int fd; /* rock for accessing the disk */ int mode; u64int offset; u64int size; /* size of the partiton */ u32int blocksize; /* block size for reads and writes */ u32int fsblocksize; /* minimum file system block size */ char *name; char *filename; Channel *writechan; /* chan[dcache.nblock](DBlock*) */ }; /* * a cached block from the partition * yuck -- most of this is internal structure for the cache * all other routines should only use data */ struct DBlock { u8int *data; Part *part; /* partition in which cached */ u64int addr; /* base address on the partition */ u32int size; /* amount of data available, not amount allocated; should go away */ u32int mode; u32int dirty; u32int dirtying; DBlock *next; /* doubly linked hash chains */ DBlock *prev; u32int heap; /* index in heap table */ u32int used; /* last reference times */ u32int used2; u32int ref; /* reference count */ RWLock lock; /* for access to data only */ Channel *writedonechan; void* chanbuf[1]; /* buffer for the chan! */ }; /* * a cached block from the partition * yuck -- most of this is internal structure for the cache * all other routines should only use data * double yuck -- this is mostly the same as a DBlock */ struct Lump { Packet *data; Part *part; /* partition in which cached */ u8int score[VtScoreSize]; /* score of packet */ u8int type; /* type of packet */ u32int size; /* amount of data allocated to hold packet */ Lump *next; /* doubly linked hash chains */ Lump *prev; u32int heap; /* index in heap table */ u32int used; /* last reference times */ u32int used2; u32int ref; /* reference count */ QLock lock; /* for access to data only */ }; /* * mapping between names and address ranges */ struct AMap { u64int start; u64int stop; char name[ANameSize]; }; /* * an AMap along with a length */ struct AMapN { int n; AMap *map; }; /* * an ArenaPart is a partition made up of Arenas * it exists because most os's don't support many partitions, * and we want to have many different Arenas */ struct ArenaPart { Part *part; u64int size; /* size of underlying partition, rounded down to blocks */ Arena **arenas; u32int tabbase; /* base address of arena table on disk */ u32int tabsize; /* max. bytes in arena table */ /* * fields stored on disk */ u32int version; u32int blocksize; /* "optimal" block size for reads and writes */ u32int arenabase; /* base address of first arena */ /* * stored in the arena mapping table on disk */ AMap *map; int narenas; }; /* * info about one block in the clump info cache */ struct CIBlock { u32int block; /* blocks in the directory */ int offset; /* offsets of one clump in the data */ DBlock *data; }; /* * Statistics kept in the tail. */ struct ATailStats { u32int clumps; /* number of clumps */ u32int cclumps; /* number of compressed clumps */ u64int used; u64int uncsize; u8int sealed; }; /* * Arena state - represents a point in the data log */ struct AState { Arena *arena; u64int aa; /* index address */ ATailStats stats; }; /* * an Arena is a log of Clumps, preceeded by an ArenaHeader, * and followed by a Arena, each in one disk block. * struct on disk is not always up to date, but should be self-consistent. * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found. * * * * * * * * * * * * * * * * * */ struct Arena { QLock lock; /* lock for arena fields, writing to disk */ Part *part; /* partition in which arena lives */ int blocksize; /* size of block to read or write */ u64int base; /* base address on disk */ u64int size; /* total space in the arena */ u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */ int clumpmax; /* ClumpInfos per block */ AState mem; int inqueue; /* * fields stored on disk */ u32int version; char name[ANameSize]; /* text label */ ATailStats memstats; ATailStats diskstats; u32int ctime; /* first time a block was written */ u32int wtime; /* last time a block was written */ u32int clumpmagic; ArenaCIG *cig; int ncig; }; struct ArenaCIG { u64int offset; // from arena base }; /* * redundant storage of some fields at the beginning of each arena */ struct ArenaHead { u32int version; char name[ANameSize]; u32int blocksize; u64int size; u32int clumpmagic; }; /* * most interesting meta information for a clump. * stored in each clump's header and in the Arena's directory, * stored in reverse order just prior to the arena trailer */ struct ClumpInfo { u8int type; u16int size; /* size of disk data, not including header */ u16int uncsize; /* size of uncompressed data */ u8int score[VtScoreSize]; /* score of the uncompressed data only */ }; /* * header for an immutable clump of data */ struct Clump { ClumpInfo info; u8int encoding; u32int creator; /* initial client which wrote the block */ u32int time; /* creation at gmt seconds since 1/1/1970 */ }; /* * index of all clumps according to their score * this is just a wrapper to tie together the index sections * * * * * * * * * * * * * * * * * * * * */ struct Index { u32int div; /* divisor for mapping score to bucket */ u32int buckets; /* last bucket used in disk hash table */ u32int blocksize; u32int tabsize; /* max. bytes in index config */ int mapalloc; /* first arena to check when adding a lump */ Arena **arenas; /* arenas in the mapping */ ISect **sects; /* sections which hold the buckets */ Bloom *bloom; /* bloom filter */ /* * fields stored in config file */ u32int version; char name[ANameSize]; /* text label */ int nsects; AMap *smap; /* mapping of buckets to index sections */ int narenas; AMap *amap; /* mapping from index addesses to arenas */ QLock writing; }; /* * one part of the bucket storage for an index. * the index blocks are sequentially allocated * across all of the sections. */ struct ISect { Part *part; int blocklog; /* log2(blocksize) */ int buckmax; /* max. entries in a index bucket */ u32int tabbase; /* base address of index config table on disk */ u32int tabsize; /* max. bytes in index config */ Channel *writechan; Channel *writedonechan; void *ig; /* used by buildindex only */ int ng; /* * fields stored on disk */ u32int version; u32int bucketmagic; char name[ANameSize]; /* text label */ char index[ANameSize]; /* index owning the section */ u32int blocksize; /* size of hash buckets in index */ u32int blockbase; /* address of start of on disk index table */ u32int blocks; /* total blocks on disk; some may be unused */ u32int start; /* first bucket in this section */ u32int stop; /* limit of buckets in this section */ }; /* * externally interesting part of an IEntry */ struct IAddr { u64int addr; u16int size; /* uncompressed size */ u8int type; /* type of block */ u8int blocks; /* arena io quanta for Clump + data */ }; /* * entries in the index * kept in IBuckets in the disk index table, * cached in the memory ICache. */ struct IEntry { /* on disk data - 32 bytes*/ u8int score[VtScoreSize]; IAddr ia; IEntry *nexthash; IEntry *nextdirty; IEntry *next; IEntry *prev; u8int state; }; enum { IEClean = 0, IEDirty = 1, IESummary = 2, }; /* * buckets in the on disk index table */ struct IBucket { u16int n; /* number of active indices */ u32int buck; /* used by buildindex/checkindex only */ u8int *data; }; /* * temporary buffers used by individual threads */ struct ZBlock { u32int len; u32int _size; u8int *data; u8int *free; }; /* * simple input buffer for a '\0' terminated text file */ struct IFile { char *name; /* name of the file */ ZBlock *b; /* entire contents of file */ u32int pos; /* current position in the file */ }; struct Statdesc { char *name; ulong max; }; /* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/ enum { StatRpcTotal, StatRpcRead, StatRpcReadOk, StatRpcReadFail, StatRpcReadBytes, StatRpcReadTime, StatRpcReadCached, StatRpcReadCachedTime, StatRpcReadUncached, StatRpcReadUncachedTime, StatRpcWrite, StatRpcWriteNew, StatRpcWriteOld, StatRpcWriteFail, StatRpcWriteBytes, StatRpcWriteTime, StatRpcWriteNewTime, StatRpcWriteOldTime, StatLcacheHit, StatLcacheMiss, StatLcacheRead, StatLcacheWrite, StatLcacheSize, StatLcacheStall, StatLcacheReadTime, StatDcacheHit, StatDcacheMiss, StatDcacheLookup, StatDcacheRead, StatDcacheWrite, StatDcacheDirty, StatDcacheSize, StatDcacheFlush, StatDcacheStall, StatDcacheLookupTime, StatDblockStall, StatLumpStall, StatIcacheHit, StatIcacheMiss, StatIcacheRead, StatIcacheWrite, StatIcacheFill, StatIcachePrefetch, StatIcacheDirty, StatIcacheSize, StatIcacheFlush, StatIcacheStall, StatIcacheReadTime, StatIcacheLookup, StatScacheHit, StatScachePrefetch, StatBloomHit, StatBloomMiss, StatBloomFalseMiss, StatBloomLookup, StatBloomOnes, StatBloomBits, StatApartRead, StatApartReadBytes, StatApartWrite, StatApartWriteBytes, StatIsectRead, StatIsectReadBytes, StatIsectWrite, StatIsectWriteBytes, StatSumRead, StatSumReadBytes, StatCigLoad, StatCigLoadTime, NStat }; extern Statdesc statdesc[NStat]; /* * statistics about the operation of the server * mainly for performance monitoring and profiling. */ struct Stats { ulong now; ulong n[NStat]; }; struct Statbin { uint nsamp; uint min; uint max; uint avg; }; struct Graph { long (*fn)(Stats*, Stats*, void*); void *arg; long t0; long t1; long min; long max; long wid; long ht; int fill; }; /* * for kicking background processes that run one round after another after another */ struct Round { QLock lock; Rendez start; Rendez finish; Rendez delaywait; int delaytime; int delaykick; char* name; int last; int current; int next; int doanother; }; /* * Bloom filter of stored block hashes */ struct Bloom { RWLock lk; /* protects nhash, nbits, tab, mb */ QLock mod; /* one marker at a time, protects nb */ int nhash; ulong size; /* bytes in tab */ ulong bitmask; /* to produce bit index */ u8int *data; Part *part; Channel *writechan; Channel *writedonechan; }; extern Index *mainindex; extern u32int maxblocksize; /* max. block size used by any partition */ extern int paranoid; /* should verify hashes on disk read */ extern int queuewrites; /* put all lump writes on a queue and finish later */ extern int readonly; /* only allowed to read the disk data */ extern Stats stats; extern u8int zeroscore[VtScoreSize]; extern int compressblocks; extern int writestodevnull; /* dangerous - for performance debugging */ extern int collectstats; extern QLock memdrawlock; extern int icachesleeptime; extern int minicachesleeptime; extern int arenasumsleeptime; extern int manualscheduling; extern int l0quantum; extern int l1quantum; extern int ignorebloom; extern int icacheprefetch; extern int syncwrites; extern Stats *stathist; extern int nstathist; extern ulong stattime; #ifndef PLAN9PORT #pragma varargck type "V" uchar* #define ODIRECT 0 #endif