/*==========================================================================
 * Copyright (c) 2002-2011 University of Massachusetts. All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
 */

/*
 * < Calculation of Field-level Collection Statistics >
 * author : jykim@cs.umass.edu
 *
 * Usage: <program> <repository-path> <output-file>
 *
 * NOTE(review): this file was restored from a copy in which all line breaks
 * and every "<...>" span had been lost. Places where the missing text had to
 * be reconstructed are marked NOTE(review); confirm them against a pristine
 * copy before relying on the exact output format.
 */

#include "lemur/common_headers.hpp"
#include "indri/Repository.hpp"
#include "indri/CompressedCollection.hpp"
#include "indri/LocalQueryServer.hpp"
#include "indri/ScopedLock.hpp"
#include "indri/QueryEnvironment.hpp"
#include "indri/Index.hpp"
// NOTE(review): the target of one further #include was lost; the standard
// headers this file uses directly are listed instead.
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace std;

// Fetch the list of words from the collection
void getVocabulary(indri::collection::Repository& r, map<string,int>& wordList, int& documentCount);
// Write the per-term occurrence counts found inside one field
void getFieldLM(indri::collection::Repository& r, string fieldName, ofstream& output, map<string,int>& wordList, int documentCount);

/**
 * Opens the repository named by argv[1], reads its vocabulary, and writes
 * field-level term statistics to the file named by argv[2].
 *
 * @return 0 on success, 1 when the arguments are missing or the output file
 *         cannot be opened.
 */
int main(int argc, char *argv[]) {
  // The original read argv[1] and argv[2] unconditionally.
  if (argc < 3) {
    const char* program = (argc > 0) ? argv[0] : "program";
    cerr << "usage: " << program << " <repository> <output-file>" << endl;
    return 1;
  }

  indri::collection::Repository r;
  string repName = argv[1];
  string fileName = argv[2];

  r.openRead(repName);
  cout << "Repository open" << endl;

  map<string,int> wordList;
  int documentCount = 0;
  getVocabulary(r, wordList, documentCount);

  /*
  ofstream output_wordlist("wordList.txt");
  for(map<string,int>::iterator iter=wordList.begin();iter!=wordList.end();iter++){
    output_wordlist<<iter->first<<" "<<iter->second<<endl;
  }
  */

  // NOTE(review): everything from here to the end of main() is reconstructed.
  // Only "fields = local.fieldList(); for( size_t i=0; i..." survived, so the
  // loop body is assumed to be one getFieldLM() call per field.
  ofstream output(fileName.c_str());
  if (!output) {
    cerr << "cannot open output file: " << fileName << endl;
    r.close();
    return 1;
  }

  {
    // Scoped so the query server is gone before the repository is closed.
    indri::server::LocalQueryServer local(r);
    std::vector<std::string> fields = local.fieldList();
    for (size_t i = 0; i < fields.size(); i++) {
      getFieldLM(r, fields[i], output, wordList, documentCount);
    }
  }

  output.close();
  r.close();
  return 0;
}

/**
 * Walks the vocabulary of the repository's first index and assigns every
 * term a sequential id starting at 1.
 *
 * @param r             an opened repository
 * @param wordList      receives term -> id
 * @param documentCount receives the document count of the first index
 *                      (0 when the repository exposes no index)
 */
void getVocabulary(indri::collection::Repository& r, map<string,int>& wordList, int& documentCount) {
  indri::collection::Repository::index_state state = r.indexes();

  // Only index 0 is read below; guard against a repository with no index.
  if ((*state).empty()) {
    cerr << "repository has no index" << endl;
    documentCount = 0;
    return;
  }

  indri::index::Index* index = (*state)[0];
  indri::index::VocabularyIterator* iter = index->vocabularyIterator();
  iter->startIteration();

  std::cout << "UniqueTermCount: " << index->uniqueTermCount() << "DocumentCount: " << index->documentCount() << std::endl;
  documentCount = index->documentCount();

  int id = 1;
  while (!iter->finished()) {
    indri::index::DiskTermData* entry = iter->currentEntry();
    indri::index::TermData* termData = entry->termData;

    if (wordList.count(termData->term) != 0) {
      // NOTE(review): the rest of this branch was lost; it is assumed to be a
      // warning only, after which the id is overwritten below.
      cerr << "term has existed!!" << endl;
    }
    wordList[termData->term] = id;
    id++;

    iter->nextEntry();
  }
  cout << "id:" << id << endl;

  // NOTE(review): no delete survived in the damaged text; releasing the
  // iterator here follows the usage in Indri's own dumpindex tool.
  delete iter;
}

/**
 * Counts, over documents 1..documentCount, how often each stem occurs inside
 * fields named fieldName, and writes one line to output made of
 * "<stem> <count>\t" entries in map (lexicographic) order.
 *
 * @param r             an opened repository
 * @param fieldName     name of the field whose extents are counted
 * @param output        stream that receives the line for this field
 * @param wordList      vocabulary produced by getVocabulary()
 * @param documentCount highest document id to request
 *
 * NOTE(review): all document vectors are requested in a single call, as in
 * the original; this may need a lot of memory on large collections.
 */
void getFieldLM(indri::collection::Repository& r, string fieldName, ofstream& output, map<string,int>& wordList, int documentCount) {
  // NOTE(review): the value type of this map was lost; int is assumed.
  // The original pre-filled a zero entry for every vocabulary term, but only
  // entries > 0 are ever written, so the pre-fill is omitted.
  map<string,int> fieldLM;

  indri::server::LocalQueryServer local(r);
  // NOTE(review): the element type was lost; DOCID_T is what documentVectors()
  // is expected to take - confirm against LocalQueryServer.hpp.
  std::vector<lemur::api::DOCID_T> documentIDs;
  if (documentCount > 0)
    documentIDs.reserve(documentCount);
  for (int i = 1; i <= documentCount; i++)
    documentIDs.push_back(i);

  indri::server::QueryServerVectorsResponse* response = local.documentVectors( documentIDs );

  for (size_t i = 0; i < response->getResults().size(); i++) {
    indri::api::DocumentVector* docVector = response->getResults()[i];

    for( size_t j=0; j<docVector->fields().size(); j++ ) {
      const indri::api::DocumentVector::Field& field = docVector->fields()[j];
      if (field.name != fieldName)
        continue;

      int begin = field.begin;
      int end = field.end;
      for (int k = begin; k < end; k++) {
        int stem_pos = docVector->positions()[k];
        const string& stem = docVector->stems()[stem_pos];

        if (stem == "[OOV]")
          continue;

        if (wordList.count(stem) == 0) {
          // NOTE(review): the text after "stem:" and the rest of this branch
          // were lost; warning and skipping the stem is an assumption.
          cerr << "stem:" << stem << " is not in the vocabulary" << endl;
          continue;
        }

        // NOTE(review): reconstructed counting step.
        fieldLM[stem]++;
      }
    }

    // NOTE(review): assumes the caller owns the returned vectors, as in
    // Indri's dumpindex tool - confirm.
    delete docVector;
  }
  delete response;

  // NOTE(review): whether the original also wrote the field name on this line
  // is unknown; only the entries below survived. The trailing "///sum" is the
  // original's disabled normalisation, so raw counts are written.
  for (map<string,int>::iterator iter = fieldLM.begin(); iter != fieldLM.end(); iter++) {
    if (iter->second > 0)
      output << iter->first << " " << iter->second << "\t"; ///sum
  }
  output << endl;
}