program bIndex; uses lists, dos, strings, crt, debug; type TFileTypes = (ftUNKNOWN, ftTXT, ftBIN, ftHTML, ftPAS, ftASM, ftINC, ftH, ftI, ftC, ftCPP, ftLOG, ftFW4, ftBTI); type TFileInfo = packed record fileNameRecordNum : uPtr; wordList : uPtr; timeStamp : uPtr; reserved : uPtr; end; // TFileInfo type pCharArrayPtr = ^pCharArray; pCharArray = array[0..0] of char; type tagType = (inTag, noTag); type PIndexer = ^TIndexer; TIndexer = object private masterBTI : PBTree; fileInfoBTI : PBTree; fileNameDAT : file; fileInfoDAT : file; dataPath : string; startingPath: string; curFileCount : integer; foundFileCount : integer; procedure deleteWordList(fileInfo:TFileInfo); function fileType(const filename:string):TFileTypes; function genUniqueFileName:uInt32; function getTimeStamp(const filename:string):uInt32; procedure parseFile(fileName:string; fileInfo:TFileInfo; ftype:TFileTypes); public constructor init; procedure Index(path, filemask:string); destructor done; end; // TIndex const spinner : array[0..3] of char = '\|/-'; constructor TIndexer.init; var direc : DirStr; fname : NameStr; exten : ExtStr; begin clrscr; masterBTI := NIL; fileInfoBTI := NIL; fsplit(paramstr(0), direc, fname, exten); if (direc = '') then getDir(0, startingPath) else startingPath := direc; dataPath := appendPathDelimiter(startingPath) + 'indicies\'; {$I-} mkdir(dataPath); {$I+} IOResult(); new(masterBTI, init(dataPath+'master.bti', 1024, BT_STRING, NIL)); new(fileInfoBTI, init(dataPath+'fileInfo.bti', 1024, BT_STRING, NIL)); assign(fileInfoDAT, dataPath+'fileInfo.dat'); if (fsearch(dataPath+'fileInfo.dat', '') = '') then rewrite(fileInfoDAT) else reset(fileInfoDAT); assign(fileNameDAT, dataPath+'fileName.dat'); if (fsearch(dataPath+'fileName.dat', '') = '') then rewrite(fileNameDAT) else reset(fileNameDAT); curFileCount := 0; foundFileCount := 0; writeln(' Indexing'); end; // TIndexer.init procedure TIndexer.deleteWordList; var wordBTI, wordListBTI:PBTree; BTSearch:BTreeSearchRec; u:uPtr; begin // To delete the old word list, we need to // walk through each entry in the wordListBTI and delete // the fileNameRecordNum out of the word BTI new(wordListBTI, open(dataPath + hex(fileInfo.wordList.u) + '.bti', NIL)); while (wordListBTI^.GetFirstKey(BTSearch)) do begin new(wordBTI, open(dataPath + hex(int32(BTSearch.key^)) + '.bti', NIL)); wordBTI^.Delete(@fileInfo.fileNameRecordNum.u, u); dispose(wordBTI, done); wordListBTI^.Delete(BTSearch.key, u); wordListBTI^.findClose(BTSearch); end; dispose(wordListBTI, done); end; // TIndexer.deleteWordList function isBinary(const fileName:string):boolean; type int8 = shortInt; int16 = smallInt; int32 = longInt; uInt8 = byte; uInt16 = word; uInt32 = dWord; type uInt8ArrayPtr = ^uInt8Array; uInt8Array = array[ 0..0 ] of uInt8; var f:file; buf:uInt8ArrayPtr; i, bytesRead:integer; invalidCount:integer; begin // assumes the file exists assign(f, fileName); {$I-} reset(f, 1); {$I+} if (IOResult() <> 0) then exit; getMem(buf, 128); if (buf = NIL) then begin writeln('Out of memory'); halt end; blockread(f, buf^, 128, bytesRead); close(f); invalidCount := 0; for i:=0 to bytesRead-1 do if buf^[i] in [0..9, 14..31, 166..255] then inc(invalidCount); freemem(buf, 128); result := invalidCount > 32; end; // isBinary function TIndexer.fileType; type TExtensionTypes = record name : string[5]; ftype : TFileTypes; end; // TExtensionTypes const nameTypes : array[0..11] of TExtensionTypes = ((name:'.TXT'; ftype:ftTXT), (name:'.HTM'; ftype:ftHTML), (name:'.HTML'; ftype:ftHTML), (name:'.PAS'; ftype:ftPAS), (name:'.ASM'; ftype:ftASM), (name:'.INC'; ftype:ftINC), (name:'.C'; ftype:ftC), (name:'.H'; ftype:ftH), (name:'.I'; ftype:ftI), (name:'.CPP'; ftype:ftCPP), (name:'.CC'; ftype:ftCPP), (name:'.LOG'; ftype:ftLOG) //(name:'.FW4'; ftype:ftFW4), //(name:'.BTI'; ftype:ftBTI) ); // nameTypes var exten : string; ftype : TFileTypes; count : integer; ld : integer; begin result := ftUNKNOWN; if (filename = '') then exit; // first try to determine the filetype based on extension ld := lastDelimiter('.', filename); if (ld = 0) then exit; exten := uppercase(copy(filename, ld, length(filename) - ld +1 )); ftype := ftUNKNOWN; for count := low(nameTypes) to high(nameTypes) do if (nameTypes[count].name = exten) then begin if isBinary(filename) then exit; result := nameTypes[count].ftype; exit; end; // scan the file here to try to figure out what it is end; // TIndexer.fileType function TIndexer.genUniqueFileName; var num:uInt32; tries:uInt32; newFile:boolean; begin tries := 0; newFile := FALSE; repeat num := random(maxlongint); newFile := fSearch(dataPath+hex(num)+'.bti', '') = ''; inc(tries); until (newFile) or (tries > 1000); if (tries > 1000) then writeln('Should probably put a new checker here'); result := num; end; // TIndexer.getUniqueFilename function TIndexer.getTimeStamp; var dirInfo:SearchRec; time:longint; begin result := 0; findFirst(filename, anyFile, dirInfo); if (dosError = 0) then result := dirInfo.time; findClose(dirInfo); end; // TIndexer.getTimeStamp procedure TIndexer.parseFile; var s:string; f:file; p:pCharArrayPtr; pSize:dword; curPos:integer; startPos, endPos:integer; u:uPtr; tmpUPtr:uPtr; wordBTI:PBTree; wordListBTI:PBTree; memWordListBTI:PBTree; code:integer; value:int64; bResult:longint; skipWord:boolean; ch:char; tagState:tagType; begin if (fsearch(filename, '') = '') then exit; assign(f, fileName); {$I-} reset(f, 1); {$I+} if (IOResult() <> 0) then exit; pSize := fileSize(f); if (pSize = 0) then begin close(f); exit end; GetMem(p, pSize); if (p = NIL) then exit; blockread(f, p^, pSize, bResult); if (bResult <> pSize) then writeln('Error reading in from file in TIndexer.parseFile'); {$I-} close(f); {$I+} if (IOResult() <> 0) then writeln('Error closing file in TIndexer.parseFile'); // update the spinner gotoxy(length(' Indexing '), 1); write(spinner[foundFileCount and 3]); inc(foundFileCount); new(wordListBTI, init(dataPath+hex(fileInfo.wordList.u)+'.bti', 256, BT_INT32, NIL)); new(memWordListBTI, init('', 256, BT_INT32, NIL)); // create a temp word list if (p^[0] = '<') then tagState := inTag else tagState := noTag; curPos := 0; case ftype of ftHTML: repeat if tagState = inTag then begin if p^[curPos] = '>' then tagState := noTag; p^[curPos] := ' '; end else begin if p^[curPos] = '<' then tagState := inTag; ch := locase(p^[curPos]); if (ch in ['a'..'z', '0'..'9', '''', '@', '-', '_', #128..#165]) then p^[curPos] := ch else p^[curPos] := ' '; end; inc(curPos); until (curPos >= pSize); ftTXT, ftPAS..ftLOG: repeat ch := locase(p^[curPos]); if (ch in ['a'..'z', '0'..'9', '''', '_', #128..#165]) then p^[curPos] := ch else p^[curPos] := ' '; inc(curPos); until (curPos >= pSize); end; // case curPos := 0; startPos := 0; endPos := 0; if p^[0] = ' ' then tagState := noTag else tagState := inTag; repeat case tagState of inTag: begin if p^[curPos] = ' ' then begin //inTag := FALSE; tagState := noTag; endPos := curPos; // strip off beginning and ending single quotes if (p^[startPos] = '''') then inc(startPos); if (p^[endPos-1] = '''') then dec(endPos); if (endPos - startPos > 2) and (endPos - startPos <= 64) then begin length(s) := endPos - startPos; move(p^[startPos], s[1], endPos-startPos); if (s[1] in ['0'..'9']) and (s[length(s)] in ['0'..'9']) then begin val(s, value, code); skipWord := (code = 0); end else skipWord := FALSE; if not skipWord then begin if (not masterBTI^.Find(@s, u)) then begin u.u := genUniqueFileName; masterBTI^.insert(@s, u); end; // check to see if it's in the cached list if (not memWordListBTI^.Find(@u, tmpUPtr)) then begin new(wordBTI, init(dataPath+hex(u.u)+'.bti', 256, BT_INT32, NIL)); wordBTI^.insert(@fileInfo.fileNameRecordNum, u); // note that I haven't decided what goes in the value field dispose(wordBTI, done); wordListBTI^.Insert(@u, u); // insert into the word list for this file memWordListBTI^.Insert(@u, u); // insert into the mem word list for this file end; end; // !skipWord end; end; end; // inTag noTag: begin if p^[curPos] <> ' ' then begin startPos := curPos; tagState := inTag; end; end; // noTag else begin writeln('tagState isn''t defined properly'); end; end; {case} inc(curPos); until (curPos >= pSize); dispose(memWordListBTI, done); dispose(wordListBTI, done); FreeMem(p, pSize); end; // TIndexer.parseFile procedure TIndexer.Index; var filename:string; fileInfo:TFileInfo; fileInfoRecordNum:uPtr; dirInfo:SearchRec; curDir:string; skipFile:boolean; ftype:TFileTypes; begin getDir(0, curDir); {$I-} chdir(path); {$I+} if (IOResult() <> 0) then exit; path := appendPathDelimiter(path); findfirst(filemask, Archive + ReadOnly, dirInfo); while dosError = 0 do begin if ((dirInfo.attr and directory) = 0) then begin gotoxy(1, 1); write(spinner[curFileCount and 3]); inc(curFileCount); gotoxy(1,24); clreol; writeln(path+dirInfo.name); // clreol; // writeln('Looking in ', path + dirInfo.name); // fillchar(filename, sizeof(filename), 0); // expand out the filename to the full path filename := lowercase(fexpand(dirInfo.name)); ftype := fileType(filename); if (ftype <> ftUNKNOWN) then begin fileInfoRecordNum.offset := 0; // clear the fileInfo record fillchar(fileInfo, sizeof(fileInfo), 0); // if (not fileNameBTI^.find(@filename, fileNameRecordNum)) then if (not fileInfoBTI^.Find(@filename, fileInfoRecordNum)) then begin // We haven't come across this file yet // fileNameRecordNum represents the offset into the fileNameDAT file // This offset is used as the key field in the file referenced by // the value field in master.bti fileInfo.fileNameRecordNum.offset := filesize(fileNameDAT); // wordList is the (numerical) filename of the file that // holds a list of words in this file. fileInfo.wordList.u := genUniqueFileName(); // timeStamp holds the time of the file. This is used to // determine whether we have already indexed this file. // It has a resolution of 2 seconds. fileInfo.timeStamp.u := getTimeStamp(filename); // reserved for future usage. fileInfo.reserved.offset := 0; // cleared above as well // write out this filename to the fileNameDAT file seek(fileNameDAT, filesize(fileNameDAT)); blockwrite(fileNameDAT, filename, length(filename)+1); fileInfoRecordNum.offset := filesize(fileInfoDAT); // insert the fileInfoRecordNum (offset into the fileInfoDAT file) // into the fileInfoBTI fileInfoBTI^.Insert(@filename, fileInfoRecordNum); // write out this file's info to the fileInfoDAT file seek(fileInfoDAT, fileInfoRecordNum.u); blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo)); skipFile := FALSE; // don't skip this file end else begin // The file already has been visited. seek(fileInfoDAT, fileInfoRecordNum.u); blockread(fileInfoDAT, fileInfo, sizeof(fileInfo)); skipFile := getTimeStamp(filename) = fileInfo.timeStamp.u; if not skipFile then begin // delete the old word list deleteWordList(fileInfo); // update the timestamp field in the fileInfo record fileInfo.timeStamp.u := getTimeStamp(filename); seek(fileInfoDAT, fileInfoRecordNum.u); blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo)); end // !skipFile end; if not skipFile then parseFile(filename, fileInfo, ftype); end // if ftype <> ftUNKNOWN end; // if (dirInfo.name[1] <> '.') then Index(path + dirInfo.name, filemask); findNext(dirInfo); end; // now recurse directories findFirst('*.*', Directory, dirInfo); while (dosError = 0) do begin if (dirInfo.name <> '.') and (dirInfo.name <> '..') then if (lowercase(dirInfo.name) <> 'indicies') then Index(path + dirInfo.name, filemask); findNext(dirInfo); end; // while {$I-} chDir(curDir); {$I+} if (IOResult() <> 0) then writeln('Could not restore previous directory. How odd...'); end; // TIndexer.Index destructor TIndexer.done; begin if (masterBTI <> NIL) then dispose(masterBTI, done); if (fileInfoBTI <> NIL) then dispose(fileInfoBTI, done); {$I-} chdir(startingPath); {$I+} if (IOResult() <> 0) then writeln('Problem restoring dir'); writeln; writeln('Done indexing.'); writeln('Looked in ', curFileCount, ' files'); writeln('Indexed ', foundFileCount, ' files'); end; // TIndexer.done procedure usage; begin writeln('Usage:'); writeln('bIndex path'); writeln('Examples:'); writeln('bIndex dir\'); writeln('bIndex dir\filename'); writeln('bIndex dir\files*.*'); writeln('bIndex c:\*.html'); halt; end; // TIndexer.usage function getPath:string; var Direc : DirStr; Fname : NameStr; Exten : ExtStr; begin fsplit(paramstr(1), direc, fname, exten); if (direc = '') then getDir(0, direc); result := appendPathDelimiter(direc); end; // getPath function getFileMask:string; var Direc : DirStr; Fname : NameStr; Exten : ExtStr; begin fsplit(paramstr(1), direc, fname, exten); if (fname = '') then begin fname := '*'; exten := '.*' end; result := fname + exten; end; // getFileMask var indexer:PIndexer; begin if (paramcount <> 1) then usage; randomize; new(indexer, init); indexer^.Index(getPath(), getFileMask()); dispose(indexer, done); end.