Newer
Older
ubixfs-2 / bIndex.pas
@flameshadow flameshadow on 10 Jun 2005 16 KB UbixFS
program bIndex;
uses lists, dos, strings, crt, debug;

type 
  TFileTypes = (ftUNKNOWN, ftTXT, ftBIN, ftHTML, ftPAS, ftASM, ftINC, ftH, ftI, ftC, ftCPP, ftLOG, ftFW4, ftBTI);

type 
  TFileInfo = packed record
    fileNameRecordNum : uPtr;
    wordList          : uPtr;
    timeStamp         : uPtr;
    reserved          : uPtr;
  end; // TFileInfo 

type
  pCharArrayPtr = ^pCharArray;
  pCharArray = array[0..0] of char;

type 
  tagType = (inTag, noTag);

type 
  PIndexer = ^TIndexer;
  TIndexer = object
   private
    masterBTI   : PBTree;
    fileInfoBTI : PBTree;
    fileNameDAT : file;
    fileInfoDAT : file;
    dataPath    : string;
    startingPath: string;
    curFileCount   : integer;
    foundFileCount : integer;
    procedure   deleteWordList(fileInfo:TFileInfo);
    function    fileType(const filename:string):TFileTypes;
    function    genUniqueFileName:uInt32;
    function    getTimeStamp(const filename:string):uInt32;
    procedure   parseFile(fileName:string; fileInfo:TFileInfo; ftype:TFileTypes);
    
   public
    constructor init;
    procedure   Index(path, filemask:string);
    destructor  done;
  end; // TIndex

const spinner : array[0..3] of char = '\|/-';

constructor TIndexer.init;
var 
  direc : DirStr;
  fname : NameStr;
  exten : ExtStr;
begin
  clrscr;

  masterBTI := NIL;
  fileInfoBTI := NIL;
  
  fsplit(paramstr(0), direc, fname, exten);
  if (direc = '') then
    getDir(0, startingPath)
  else
    startingPath := direc;

  dataPath := appendPathDelimiter(startingPath) + 'indicies\';
  {$I-} mkdir(dataPath); {$I+}
  IOResult();
  
  new(masterBTI, init(dataPath+'master.bti', 1024, BT_STRING, NIL));
  new(fileInfoBTI, init(dataPath+'fileInfo.bti', 1024, BT_STRING, NIL));

  assign(fileInfoDAT, dataPath+'fileInfo.dat');
  if (fsearch(dataPath+'fileInfo.dat', '') = '') then
    rewrite(fileInfoDAT)
  else
    reset(fileInfoDAT);
  
  assign(fileNameDAT, dataPath+'fileName.dat');
  if (fsearch(dataPath+'fileName.dat', '') = '') then
    rewrite(fileNameDAT)
  else
    reset(fileNameDAT);

  curFileCount := 0;
  foundFileCount := 0;        
  writeln('  Indexing');
end; // TIndexer.init

procedure TIndexer.deleteWordList;
var 
  wordBTI, wordListBTI:PBTree;
  BTSearch:BTreeSearchRec;
  u:uPtr;
begin
  // To delete the old word list, we need to 
  // walk through each entry in the wordListBTI and delete
  // the fileNameRecordNum out of the word BTI
  new(wordListBTI, open(dataPath + hex(fileInfo.wordList.u) + '.bti', NIL));
  while (wordListBTI^.GetFirstKey(BTSearch)) do
    begin     
      new(wordBTI, open(dataPath + hex(int32(BTSearch.key^)) + '.bti', NIL));
      wordBTI^.Delete(@fileInfo.fileNameRecordNum.u, u);
      dispose(wordBTI, done);
      wordListBTI^.Delete(BTSearch.key, u);      
      wordListBTI^.findClose(BTSearch);
    end;
  dispose(wordListBTI, done);
end; // TIndexer.deleteWordList

function isBinary(const fileName:string):boolean;
type
  int8          = shortInt;
  int16         = smallInt;
  int32         = longInt;

  uInt8         = byte;
  uInt16        = word;
  uInt32        = dWord;

type
  uInt8ArrayPtr = ^uInt8Array;
  uInt8Array    = array[ 0..0 ] of uInt8;

var 
  f:file;
  buf:uInt8ArrayPtr;
  i, bytesRead:integer;
  invalidCount:integer;
begin
  // assumes the file exists
  assign(f, fileName);
  {$I-} reset(f, 1); {$I+}
  if (IOResult() <> 0) then exit;

  getMem(buf, 128);
  if (buf = NIL) then 
    begin
      writeln('Out of memory');
      halt
    end;
  
  blockread(f, buf^, 128, bytesRead);
  close(f);
  invalidCount := 0;
  for i:=0 to bytesRead-1 do
    if buf^[i] in [0..9, 14..31, 166..255] then inc(invalidCount);
  freemem(buf, 128);
  result := invalidCount > 32;
end;  // isBinary

function TIndexer.fileType;
type 
  TExtensionTypes = record
    name  : string[5];
    ftype : TFileTypes;
  end; // TExtensionTypes

const   
  nameTypes : array[0..11] of TExtensionTypes = 
   ((name:'.TXT';  ftype:ftTXT),
    (name:'.HTM';  ftype:ftHTML), 
    (name:'.HTML'; ftype:ftHTML),
    (name:'.PAS';  ftype:ftPAS),
    (name:'.ASM';  ftype:ftASM),
    (name:'.INC';  ftype:ftINC),
    (name:'.C';    ftype:ftC),
    (name:'.H';    ftype:ftH),
    (name:'.I';    ftype:ftI),
    (name:'.CPP';  ftype:ftCPP),
    (name:'.CC';   ftype:ftCPP),
    (name:'.LOG';  ftype:ftLOG)
    //(name:'.FW4';  ftype:ftFW4),
    //(name:'.BTI';  ftype:ftBTI)
   ); // nameTypes
    
var 
  exten : string;
  ftype : TFileTypes;
  count : integer;
  ld    : integer;  
begin
  result := ftUNKNOWN;
 
  if (filename = '') then exit;
  // first try to determine the filetype based on extension
  ld := lastDelimiter('.', filename);
  if (ld = 0) then exit;
  exten := uppercase(copy(filename, ld, length(filename) - ld +1 ));

  ftype := ftUNKNOWN;
  for count := low(nameTypes) to high(nameTypes) do
    if (nameTypes[count].name = exten) then 
      begin
        if isBinary(filename) then exit;
        result := nameTypes[count].ftype;
        exit;
      end;
  // scan the file here to try to figure out what it is
end; // TIndexer.fileType

function TIndexer.genUniqueFileName;
var 
  num:uInt32;
  tries:uInt32;
  newFile:boolean;
begin
  tries := 0;
  newFile := FALSE;
  repeat
    num := random(maxlongint);
    newFile := fSearch(dataPath+hex(num)+'.bti', '') = '';
    inc(tries);
  until (newFile) or (tries > 1000);
  if (tries > 1000) then writeln('Should probably put a new checker here');
  result := num;
end;  // TIndexer.getUniqueFilename

function TIndexer.getTimeStamp;
var 
  dirInfo:SearchRec;
  time:longint;
begin
  result := 0;
  findFirst(filename, anyFile, dirInfo);
  if (dosError = 0) then 
    result := dirInfo.time;
  findClose(dirInfo);
end; // TIndexer.getTimeStamp

procedure TIndexer.parseFile;
var 
  s:string;
  f:file;
  p:pCharArrayPtr;
  pSize:dword;
  curPos:integer;
  startPos, endPos:integer;
  u:uPtr;
  tmpUPtr:uPtr;
  wordBTI:PBTree;
  wordListBTI:PBTree;
  memWordListBTI:PBTree;
  code:integer;
  value:int64;
  bResult:longint;
  skipWord:boolean;
  ch:char;
  tagState:tagType;

begin
  if (fsearch(filename, '') = '') then exit;
  assign(f, fileName);
  {$I-} reset(f, 1); {$I+}
  if (IOResult() <> 0) then exit;
  pSize := fileSize(f);
  if (pSize = 0) then
    begin
      close(f);
      exit
    end;
  GetMem(p, pSize);
  if (p = NIL) then exit;
  blockread(f, p^, pSize, bResult);
  if (bResult <> pSize) then writeln('Error reading in from file in TIndexer.parseFile');
  {$I-} close(f); {$I+}
  if (IOResult() <> 0) then writeln('Error closing file in TIndexer.parseFile');
  
  // update the spinner
  gotoxy(length('  Indexing  '), 1);
  write(spinner[foundFileCount and 3]);
  inc(foundFileCount);

  new(wordListBTI, init(dataPath+hex(fileInfo.wordList.u)+'.bti', 256, BT_INT32, NIL));
  new(memWordListBTI, init('', 256, BT_INT32, NIL));  // create a temp word list

  if (p^[0] = '<') then tagState := inTag else tagState := noTag;
 
  curPos := 0;
  case ftype of
    ftHTML:
      repeat
        if tagState = inTag then
          begin
           if p^[curPos] = '>' then tagState := noTag;
           p^[curPos] := ' ';
          end
        else
          begin
            if p^[curPos] = '<' then tagState := inTag;
            ch := locase(p^[curPos]);
            if (ch in ['a'..'z', '0'..'9', '''', '@', '-', '_', #128..#165]) then 
              p^[curPos] := ch
            else 
              p^[curPos] := ' ';
          end;
        inc(curPos);
      until (curPos >= pSize);
    ftTXT, ftPAS..ftLOG:
      repeat
        ch := locase(p^[curPos]);
        if (ch in ['a'..'z', '0'..'9', '''', '_', #128..#165]) then
          p^[curPos] := ch
        else
          p^[curPos] := ' ';

        inc(curPos);
      until (curPos >= pSize);  
  end; // case
  
  curPos := 0;
  startPos := 0;
  endPos := 0;
  
  if p^[0] = ' ' then tagState := noTag else tagState := inTag;
  repeat
    case tagState of
      inTag:
        begin
          if p^[curPos] = ' ' then 
            begin
              //inTag := FALSE;
              tagState := noTag;
              endPos := curPos;
              // strip off beginning and ending single quotes
              if (p^[startPos] = '''') then inc(startPos);
              if (p^[endPos-1] = '''') then dec(endPos);
              
              if (endPos - startPos > 2) and (endPos - startPos <= 64) then
                begin
                  length(s) := endPos - startPos;
                  move(p^[startPos], s[1], endPos-startPos);

                  if (s[1] in ['0'..'9']) and (s[length(s)] in ['0'..'9']) then
                    begin
                      val(s, value, code);
                      skipWord := (code = 0);
                    end
                  else skipWord := FALSE;

                  if not skipWord then
                    begin
                      if (not masterBTI^.Find(@s, u)) then 
                        begin
                          u.u := genUniqueFileName;
                          masterBTI^.insert(@s, u);
                        end;

                      // check to see if it's in the cached list
                      if (not memWordListBTI^.Find(@u, tmpUPtr)) then 
                        begin
                          new(wordBTI, init(dataPath+hex(u.u)+'.bti', 256, BT_INT32, NIL));
                          wordBTI^.insert(@fileInfo.fileNameRecordNum, u); // note that I haven't decided what goes in the value field
                          dispose(wordBTI, done);
                          wordListBTI^.Insert(@u, u);  // insert into the word list for this file
                          memWordListBTI^.Insert(@u, u); // insert into the mem word list for this file
                        end;
                    end; // !skipWord
                end;
            end;
        end;  // inTag
      noTag:
        begin
          if p^[curPos] <> ' ' then
            begin
              startPos := curPos;
              tagState := inTag;
            end;
        end;  // noTag
       else
       begin
       	writeln('tagState isn''t defined properly');
       end;
    end; {case}
    inc(curPos);
  until (curPos >= pSize);
  dispose(memWordListBTI, done);
  dispose(wordListBTI, done);
  FreeMem(p, pSize);
end; // TIndexer.parseFile

procedure TIndexer.Index;
var
  filename:string;
  fileInfo:TFileInfo;
  fileInfoRecordNum:uPtr;
  dirInfo:SearchRec;
  curDir:string;
  skipFile:boolean;
  ftype:TFileTypes;

begin
  getDir(0, curDir);
  {$I-} chdir(path); {$I+}
  if (IOResult() <> 0) then exit;

  path := appendPathDelimiter(path);
  findfirst(filemask, Archive + ReadOnly, dirInfo);
  while dosError = 0 do
    begin
      if ((dirInfo.attr and directory) = 0) then
        begin
          gotoxy(1, 1);
          write(spinner[curFileCount and 3]);
          inc(curFileCount);
          gotoxy(1,24);
          clreol;
          writeln(path+dirInfo.name);
  //        clreol;
  //        writeln('Looking in ', path + dirInfo.name);
            
          // fillchar(filename, sizeof(filename), 0);
          // expand out the filename to the full path
          filename := lowercase(fexpand(dirInfo.name));
          ftype := fileType(filename);
          if (ftype <> ftUNKNOWN) then
            begin  
              fileInfoRecordNum.offset := 0;
      
              // clear the fileInfo record
              fillchar(fileInfo, sizeof(fileInfo), 0);
      
              // if (not fileNameBTI^.find(@filename, fileNameRecordNum)) then
              if (not fileInfoBTI^.Find(@filename, fileInfoRecordNum)) then
                begin
                  // We haven't come across this file yet
          
                  // fileNameRecordNum represents the offset into the fileNameDAT file
                  // This offset is used as the key field in the file referenced by
                  // the value field in master.bti
                  fileInfo.fileNameRecordNum.offset := filesize(fileNameDAT);  
          
                  // wordList is the (numerical) filename of the file that 
                  // holds a list of words in this file.
                  fileInfo.wordList.u := genUniqueFileName();
          
                  // timeStamp holds the time of the file. This is used to
                  // determine whether we have already indexed this file.
                  // It has a resolution of 2 seconds.
                  fileInfo.timeStamp.u := getTimeStamp(filename);

                  // reserved for future usage.
                  fileInfo.reserved.offset := 0;  // cleared above as well
          
                  // write out this filename to the fileNameDAT file          
                  seek(fileNameDAT, filesize(fileNameDAT));
                  blockwrite(fileNameDAT, filename, length(filename)+1);

                  fileInfoRecordNum.offset := filesize(fileInfoDAT);
                  // insert the fileInfoRecordNum (offset into the fileInfoDAT file)
                  // into the fileInfoBTI
                  fileInfoBTI^.Insert(@filename, fileInfoRecordNum);
          
                  // write out this file's info to the fileInfoDAT file
                  seek(fileInfoDAT, fileInfoRecordNum.u);
                  blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo));
                  skipFile := FALSE;  // don't skip this file          
                end
              else
                begin
                  // The file already has been visited. 
                  seek(fileInfoDAT, fileInfoRecordNum.u);
                  blockread(fileInfoDAT, fileInfo, sizeof(fileInfo));
                  skipFile := getTimeStamp(filename) = fileInfo.timeStamp.u;
                  if not skipFile then 
                    begin
                      // delete the old word list
                      deleteWordList(fileInfo);  
                      // update the timestamp field in the fileInfo record
                      fileInfo.timeStamp.u := getTimeStamp(filename);
                      seek(fileInfoDAT, fileInfoRecordNum.u);
                      blockwrite(fileInfoDAT, fileInfo, sizeof(fileInfo));
                    end // !skipFile
                end;
              if not skipFile then parseFile(filename, fileInfo, ftype);
            end // if ftype <> ftUNKNOWN
        end;

//        if (dirInfo.name[1] <> '.') then Index(path + dirInfo.name, filemask);

      findNext(dirInfo);
    end;  

  // now recurse directories
  findFirst('*.*', Directory, dirInfo);
  while (dosError = 0) do
    begin
      if (dirInfo.name <> '.') and (dirInfo.name <> '..') then
        if (lowercase(dirInfo.name) <> 'indicies') then 
          Index(path + dirInfo.name, filemask);
      findNext(dirInfo);
    end; // while

  {$I-} chDir(curDir); {$I+}
  if (IOResult() <> 0) then writeln('Could not restore previous directory. How odd...');
end; // TIndexer.Index

destructor TIndexer.done;
begin
  if (masterBTI <> NIL) then dispose(masterBTI, done);
  if (fileInfoBTI <> NIL) then dispose(fileInfoBTI, done);

  {$I-} chdir(startingPath); {$I+}
  if (IOResult() <> 0) then writeln('Problem restoring dir');
  writeln;
  writeln('Done indexing.');
  writeln('Looked in ', curFileCount, ' files');
  writeln('Indexed ', foundFileCount, ' files');
end; // TIndexer.done

procedure usage;
begin
  writeln('Usage:');
  writeln('bIndex path');
  writeln('Examples:');
  writeln('bIndex dir\');
  writeln('bIndex dir\filename');
  writeln('bIndex dir\files*.*');
  writeln('bIndex c:\*.html');
  halt;
end; // TIndexer.usage

function getPath:string;
var
  Direc : DirStr;
  Fname : NameStr;
  Exten : ExtStr;
begin
  fsplit(paramstr(1), direc, fname, exten);
  if (direc = '') then
    getDir(0, direc);
  result := appendPathDelimiter(direc);
end; // getPath

function getFileMask:string;
var
  Direc : DirStr;
  Fname : NameStr;
  Exten : ExtStr;
begin
  fsplit(paramstr(1), direc, fname, exten);
  if (fname = '') then 
    begin
      fname := '*';
      exten := '.*'
    end;
  result := fname + exten;
end; // getFileMask

var indexer:PIndexer;

begin
  if (paramcount <> 1) then usage;
  randomize;

  new(indexer, init);
  indexer^.Index(getPath(), getFileMask());
  dispose(indexer, done);

end.