program bIndex;
uses lists, dos, strings, crt, debug;
type
TFileTypes = (ftUNKNOWN, ftTXT, ftBIN, ftHTML, ftPAS, ftASM, ftINC, ftH, ftI, ftC, ftCPP, ftLOG, ftFW4, ftBTI);
type
TFileInfo = packed record
fileNameRecordNum : uPtr;
wordList : uPtr;
timeStamp : uPtr;
reserved : uPtr;
end; // TFileInfo
type
pCharArrayPtr = ^pCharArray;
pCharArray = array[0..0] of char;
type
tagType = (inTag, noTag);
type
PIndexer = ^TIndexer;
TIndexer = object
private
masterBTI : PBTree;
fileInfoBTI : PBTree;
fileNameDAT : file;
fileInfoDAT : file;
dataPath : string;
startingPath: string;
curFileCount : integer;
foundFileCount : integer;
procedure deleteWordList(fileInfo:TFileInfo);
function fileType(const filename:string):TFileTypes;
function genUniqueFileName:uInt32;
function getTimeStamp(const filename:string):uInt32;
procedure parseFile(fileName:string; fileInfo:TFileInfo; ftype:TFileTypes);
public
constructor init;
procedure Index(path, filemask:string);
destructor done;
end; // TIndex
const spinner : array[0..3] of char = '\|/-';
constructor TIndexer.init;
var
direc : DirStr;
fname : NameStr;
exten : ExtStr;
begin
clrscr;
masterBTI := NIL;
fileInfoBTI := NIL;
fsplit(paramstr(0), direc, fname, exten);
if (direc = '') then
getDir(0, startingPath)
else
startingPath := direc;
dataPath := appendPathDelimiter(startingPath) + 'indicies\';
{$I-} mkdir(dataPath); {$I+}
IOResult();
new(masterBTI, init(dataPath+'master.bti', 1024, BT_STRING, NIL));
new(fileInfoBTI, init(dataPath+'fileInfo.bti', 1024, BT_STRING, NIL));
assign(fileInfoDAT, dataPath+'fileInfo.dat');
if (fsearch(dataPath+'fileInfo.dat', '') = '') then
rewrite(fileInfoDAT)
else
reset(fileInfoDAT);
assign(fileNameDAT, dataPath+'fileName.dat');
if (fsearch(dataPath+'fileName.dat', '') = '') then
rewrite(fileNameDAT)
else
reset(fileNameDAT);
curFileCount := 0;
foundFileCount := 0;
writeln(' Indexing');
end; // TIndexer.init
procedure TIndexer.deleteWordList;
var
wordBTI, wordListBTI:PBTree;
BTSearch:BTreeSearchRec;
u:uPtr;
begin
// To delete the old word list, we need to
// walk through each entry in the wordListBTI and delete
// the fileNameRecordNum out of the word BTI
new(wordListBTI, open(dataPath + hex(fileInfo.wordList.u) + '.bti', NIL));
while (wordListBTI^.GetFirstKey(BTSearch)) do
begin
new(wordBTI, open(dataPath + hex(int32(BTSearch.key^)) + '.bti', NIL));
wordBTI^.Delete(@fileInfo.fileNameRecordNum.u, u);
dispose(wordBTI, done);
wordListBTI^.Delete(BTSearch.key, u);
wordListBTI^.findClose(BTSearch);
end;
dispose(wordListBTI, done);
end; // TIndexer.deleteWordList
function isBinary(const fileName:string):boolean;
type
int8 = shortInt;
int16 = smallInt;
int32 = longInt;
uInt8 = byte;
uInt16 = word;
uInt32 = dWord;
type
uInt8ArrayPtr = ^uInt8Array;
uInt8Array = array[ 0..0 ] of uInt8;
var
f:file;
buf:uInt8ArrayPtr;
i, bytesRead:integer;
invalidCount:integer;
begin
// assumes the file exists
assign(f, fileName);
{$I-} reset(f, 1); {$I+}
if (IOResult() <> 0) then exit;
getMem(buf, 128);
if (buf = NIL) then
begin
writeln('Out of memory');
halt
end;
blockread(f, buf^, 128, bytesRead);
close(f);
invalidCount := 0;
for i:=0 to bytesRead-1 do
if buf^[i] in [0..9, 14..31, 166..255] then inc(invalidCount);
freemem(buf, 128);
result := invalidCount > 32;
end; // isBinary
function TIndexer.fileType;
type
TExtensionTypes = record
name : string[5];
ftype : TFileTypes;
end; // TExtensionTypes
const
nameTypes : array[0..11] of TExtensionTypes =
((name:'.TXT'; ftype:ftTXT),
(name:'.HTM'; ftype:ftHTML),
(name:'.HTML'; ftype:ftHTML),
(name:'.PAS'; ftype:ftPAS),
(name:'.ASM'; ftype:ftASM),
(name:'.INC'; ftype:ftINC),
(name:'.C'; ftype:ftC),
(name:'.H'; ftype:ftH),
(name:'.I'; ftype:ftI),
(name:'.CPP'; ftype:ftCPP),
(name:'.CC'; ftype:ftCPP),
(name:'.LOG'; ftype:ftLOG)
//(name:'.FW4'; ftype:ftFW4),
//(name:'.BTI'; ftype:ftBTI)
); // nameTypes
var
exten : string;
ftype : TFileTypes;
count : integer;
ld : integer;
begin
result := ftUNKNOWN;
if (filename = '') then exit;
// first try to determine the filetype based on extension
ld := lastDelimiter('.', filename);
if (ld = 0) then exit;
exten := uppercase(copy(filename, ld, length(filename) - ld +1 ));
ftype := ftUNKNOWN;
for count := low(nameTypes) to high(nameTypes) do
if (nameTypes[count].name = exten) then
begin
if isBinary(filename) then exit;
result := nameTypes[count].ftype;
exit;
end;
// scan the file here to try to figure out what it is
end; // TIndexer.fileType
function TIndexer.genUniqueFileName;
var
num:uInt32;
tries:uInt32;
newFile:boolean;
begin
tries := 0;
newFile := FALSE;
repeat
num := random(maxlongint);
newFile := fSearch(dataPath+hex(num)+'.bti', '') = '';
inc(tries);
until (newFile) or (tries > 1000);
if (tries > 1000) then writeln('Should probably put a new checker here');
result := num;
end; // TIndexer.getUniqueFilename
function TIndexer.getTimeStamp;
var
dirInfo:SearchRec;
time:longint;
begin
result := 0;
findFirst(filename, anyFile, dirInfo);
if (dosError = 0) then
result := dirInfo.time;
findClose(dirInfo);
end; // TIndexer.getTimeStamp
procedure TIndexer.parseFile;
var
s:string;
f:file;
p:pCharArrayPtr;
pSize:dword;
curPos:integer;
startPos, endPos:integer;
u:uPtr;
tmpUPtr:uPtr;
wordBTI:PBTree;
wordListBTI:PBTree;
memWordListBTI:PBTree;
code:integer;
value:int64;
bResult:longint;
skipWord:boolean;
ch:char;
tagState:tagType;
begin
if (fsearch(filename, '') = '') then exit;
assign(f, fileName);
{$I-} reset(f, 1); {$I+}
if (IOResult() <> 0) then exit;
pSize := fileSize(f);
if (pSize = 0) then
begin
close(f);
exit
end;
GetMem(p, pSize);
if (p = NIL) then exit;
blockread(f, p^, pSize, bResult);
if (bResult <> pSize) then writeln('Error reading in from file in TIndexer.parseFile');
{$I-} close(f); {$I+}
if (IOResult() <> 0) then writeln('Error closing file in TIndexer.parseFile');
// update the spinner
gotoxy(length(' Indexing '), 1);
write(spinner[foundFileCount and 3]);
inc(foundFileCount);
new(wordListBTI, init(dataPath+hex(fileInfo.wordList.u)+'.bti', 256, BT_INT32, NIL));
new(memWordListBTI, init('', 256, BT_INT32, NIL)); // create a temp word list
if (p^[0] = '<') then tagState := inTag else tagState := noTag;
curPos := 0;
case ftype of
ftHTML:
repeat
if tagState = inTag then
begin
if p^[curPos] = '>' then tagState := noTag;
p^[curPos] := ' ';
end
else
begin
if p^[curPos] = '<' then tagState := inTag;
ch := locase(p^[curPos]);
if (ch in ['a'..'z', '0'..'9', '''', '@', '-', '_', #128..#165]) then
p^[curPos] := ch
else
p^[curPos] := ' ';
end;
inc(curPos);
until (curPos >= pSize);
ftTXT, ftPAS..ftLOG:
repeat
ch := locase(p^[curPos]);
if (ch in ['a'..'z', '0'..'9', '''', '_', #128..#165]) then
p^[curPos] := ch
else
p^[curPos] := ' ';
inc(curPos);
until (curPos >= pSize);
end; // case
curPos := 0;
startPos := 0;
endPos := 0;
if p^[0] = ' ' then tagState := noTag else tagState := inTag;
repeat
case tagState of
inTag:
begin
if p^[curPos] = ' ' then
begin
//inTag := FALSE;
tagState := noTag;
endPos := curPos;
// strip off beginning and ending single quotes
if (p^[startPos] = '''') then inc(startPos);
if (p^[endPos-1] = '''') then dec(endPos);
if (endPos - startPos > 2) and (endPos - startPos <= 64) then
begin
length(s) := endPos - startPos;
move(p^[startPos], s[1], endPos-startPos);
if (s[1] in ['0'..'9']) and (s[length(s)] in ['0'..'9']) then
begin
val(s, value, code);
skipWord := (code = 0);
end
else skipWord := FALSE;
if not skipWord then
begin
if (not masterBTI^.Find(@s, u)) then
begin
u.u := genUniqueFileName;
masterBTI^.insert(@s, u);
end;
// check to see if it's in the cached list
if (not memWordListBTI^.Find(@u, tmpUPtr)) then
begin
new(wordBTI, init(dataPath+hex(u.u)+'.bti', 256, BT_INT32, NIL));
wordBTI^.insert(@fileInfo.fileNameRecordNum, u); // note that I haven't decided what goes in the value field
dispose(wordBTI, done);
wordListBTI^.Insert(@u, u); // insert into the word list for this file
memWordListBTI^.Insert(@u, u); // insert into the mem word list for this file
end;
end; // !skipWord
end;
end;
end; // inTag
noTag:
begin
if p^[curPos] <> ' ' then
begin
startPos := curPos;
tagState := inTag;
end;
end; // noTag
else
begin
writeln('tagState isn''t defined properly');
end;
end; {case}
inc(curPos);
until (curPos >= pSize);
dispose(memWordListBTI, done);
dispose(wordListBTI, done);
FreeMem(p, pSize);
end; // TIndexer.parseFile
procedure TIndexer.Index;
var
filename:string;
fileInfo:TFileInfo;
fileInfoRecordNum:uPtr;
dirInfo:SearchRec;
curDir:string;
skipFile:boolean;
ftype:TFileTypes;
begin
getDir(0, curDir);
{$I-} chdir(path); {$I+}
if (IOResult() <> 0) then exit;
path := appendPathDelimiter(path);
findfirst(filemask, Archive + ReadOnly, dirInfo);
while dosError = 0 do
begin
if ((dirInfo.attr and directory) = 0) then
begin
gotoxy(1, 1);
write(spinner[curFileCount and 3]);
inc(curFileCount);
gotoxy(1,24);
clreol;
writeln(path+dirInfo.name);
// clreol;
// writeln('Looking in ', path + dirInfo.name);
// fillchar(filename, sizeof(filename), 0);
// expand out the filename to the full path
filename := lowercase(fexpand(dirInfo.name));
ftype := fileType(filename);
if (ftype <> ftUNKNOWN) then
begin
fileInfoRecordNum.offset := 0;
// clear the fileInfo record
fillchar(fileInfo, sizeof(fileInfo), 0);
// if (not fileNameBTI^.find(@filename, fileNameRecordNum)) then
if (not fileInfoBTI^.Find(@filename, fileInfoRecordNum)) then
begin
// We haven't come across this file yet
// fileNameRecordNum represents the offset into the fileNameDAT file
// This offset is used as the key field in the file referenced by
// the value field in master.bti
fileInfo.fileNameRecordNum.offset := filesize(fileNameDAT);
// wordList is the (numerical) filename of the file that
// holds a list of words in this file.
fileInfo.wordList.u := genUniqueFileName();
// timeStamp holds the time of the file. This is used to
// determine whether we have already indexed this file.
// It has a resolution of 2 seconds.
fileInfo.timeStamp.u := getTimeStamp(filename);
// reserved for future usage.
fileInfo.reserved.offset := 0; // cleared above as well
// write out this filename to the fileNameDAT file
seek(fileNameDAT, filesize(fileNameDAT));
blockwrite(fileNameDAT, filename, length(filename)+1);
fileInfoRecordNum.offset := filesize(fileInfoDAT);
// insert the fileInfoRecordNum (offset into the fileInfoDAT file)
// into the fileInfoBTI
fileInfoBTI^.Insert(@filename, fileInfoRecordNum);
// write out this file's info to the fileInfoDAT file
seek(fileInfoDAT, fileInfoRecordNum.u);
blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo));
skipFile := FALSE; // don't skip this file
end
else
begin
// The file already has been visited.
seek(fileInfoDAT, fileInfoRecordNum.u);
blockread(fileInfoDAT, fileInfo, sizeof(fileInfo));
skipFile := getTimeStamp(filename) = fileInfo.timeStamp.u;
if not skipFile then
begin
// delete the old word list
deleteWordList(fileInfo);
// update the timestamp field in the fileInfo record
fileInfo.timeStamp.u := getTimeStamp(filename);
seek(fileInfoDAT, fileInfoRecordNum.u);
blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo));
end // !skipFile
end;
if not skipFile then parseFile(filename, fileInfo, ftype);
end // if ftype <> ftUNKNOWN
end;
// if (dirInfo.name[1] <> '.') then Index(path + dirInfo.name, filemask);
findNext(dirInfo);
end;
// now recurse directories
findFirst('*.*', Directory, dirInfo);
while (dosError = 0) do
begin
if (dirInfo.name <> '.') and (dirInfo.name <> '..') then
if (lowercase(dirInfo.name) <> 'indicies') then
Index(path + dirInfo.name, filemask);
findNext(dirInfo);
end; // while
{$I-} chDir(curDir); {$I+}
if (IOResult() <> 0) then writeln('Could not restore previous directory. How odd...');
end; // TIndexer.Index
destructor TIndexer.done;
begin
if (masterBTI <> NIL) then dispose(masterBTI, done);
if (fileInfoBTI <> NIL) then dispose(fileInfoBTI, done);
{$I-} chdir(startingPath); {$I+}
if (IOResult() <> 0) then writeln('Problem restoring dir');
writeln;
writeln('Done indexing.');
writeln('Looked in ', curFileCount, ' files');
writeln('Indexed ', foundFileCount, ' files');
end; // TIndexer.done
procedure usage;
begin
writeln('Usage:');
writeln('bIndex path');
writeln('Examples:');
writeln('bIndex dir\');
writeln('bIndex dir\filename');
writeln('bIndex dir\files*.*');
writeln('bIndex c:\*.html');
halt;
end; // TIndexer.usage
function getPath:string;
var
Direc : DirStr;
Fname : NameStr;
Exten : ExtStr;
begin
fsplit(paramstr(1), direc, fname, exten);
if (direc = '') then
getDir(0, direc);
result := appendPathDelimiter(direc);
end; // getPath
function getFileMask:string;
var
Direc : DirStr;
Fname : NameStr;
Exten : ExtStr;
begin
fsplit(paramstr(1), direc, fname, exten);
if (fname = '') then
begin
fname := '*';
exten := '.*'
end;
result := fname + exten;
end; // getFileMask
var indexer:PIndexer;
begin
if (paramcount <> 1) then usage;
randomize;
new(indexer, init);
indexer^.Index(getPath(), getFileMask());
dispose(indexer, done);
end.