| #include <tchar.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <stdexcept>
#include <memory>
#include <string>
#include <iosfwd>
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include "boost/config.hpp"
#include "boost/foreach.hpp"
#include "boost/function.hpp"
#include "boost/static_assert.hpp"
#include "boost/type_traits.hpp"
#include "boost/mpl/if.hpp"
#include "boost/range.hpp"
#include "boost/range/adaptors.hpp"
//////////////////////////////////////////////////////////////////////////
namespace Utils
{
//////////////////////////////////////////////////////////////////////////
template< class CharT, bool t_ignCase >
struct CharsEqual
{
bool operator ()(CharT a, CharT b) const { return a == b; }
};
template<> struct CharsEqual< char, true >
{
bool operator ()(char a, char b) const { return ::_strnicmp(&a, &b, 1) == 0; }
};
template<> struct CharsEqual< wchar_t, true >
{
bool operator ()(wchar_t a, wchar_t b) const { return ::_wcsnicmp(&a, &b, 1) == 0; }
};
//////////////////////////////////////////////////////////////////////////
namespace Aux
{
struct MatchState
{
int count;
bool skip;
};
typedef std::vector< MatchState > MatchCountersT;
//////////////////////////////////////////////////////////////////////////
template< class DiffT, bool t_Greedy >
struct SearchGreedinessImpl
{
static bool ResetCounters(MatchCountersT & matchCounters, int cTargets, int /*nTarget*/)
{
// Обнуляем счетчики совпадений всех подстрок (кто первый, тот и папа)
for (int j = 0; j < cTargets; ++j)
matchCounters[j].count = 0;
// В нежадном поиске немедленно прекращаем сопоставление
// текущего совпадения с остальными подстроками
return true;
}
};
template< class DiffT >
struct SearchGreedinessImpl< DiffT, true > : SearchGreedinessImpl< DiffT, false >
{
static bool ResetCounters(MatchCountersT & matchCounters, int /*cTargets*/, int nTarget)
{
// Обнуляем только счетчик совпадений текущей подстроки
matchCounters[nTarget].count = 0;
return false;
}
};
//////////////////////////////////////////////////////////////////////////
template< bool t_Reversed, class RangeT >
struct ReversedRangeIf
{
typedef typename boost::mpl::if_c<
t_Reversed
, boost::reversed_range< RangeT >
, boost::sub_range< RangeT >
>
::type ResultT;
};
template< bool t_Reversed, class RangeT >
inline
typename ReversedRangeIf< t_Reversed, RangeT const >::ResultT GetReversedIf(RangeT const & rng)
{
typedef typename ReversedRangeIf< t_Reversed, RangeT const >::ResultT ResultT;
return ResultT(rng);
}
} //namespace Aux {
//////////////////////////////////////////////////////////////////////////
typedef boost::function< int(int, int) > KmpTableProducerT;
template< class TargetsRangeT, bool t_Forward >
struct KmpTableProducer
{
typedef std::vector< int > IntVecT;
typedef typename TargetsRangeT::const_iterator TargetsIterT;
typedef typename std::iterator_traits< TargetsIterT > TargetsIterTraitsT;
BOOST_STATIC_ASSERT( (boost::is_base_of< std::forward_iterator_tag, typename TargetsIterTraitsT::iterator_category >::value) );
typedef typename TargetsIterTraitsT::value_type SubCharRangeInT;
typedef typename Aux::ReversedRangeIf< !t_Forward, SubCharRangeInT const >::ResultT SubCharRangeT;
typedef std::pair< SubCharRangeT, IntVecT > SubCharRange2ShiftVecT;
typedef std::vector< SubCharRange2ShiftVecT > ShiftsMapT;
typedef typename SubCharRangeT::const_iterator SubCharIterT;
typedef typename std::iterator_traits< SubCharIterT > SubCharIterTraitsT;
BOOST_STATIC_ASSERT( (boost::is_base_of< std::random_access_iterator_tag, typename SubCharIterTraitsT::iterator_category >::value) );
KmpTableProducer(TargetsRangeT const & rngTargets)
: m_shiftsMap( rngTargets.size() )
{
ShiftsMapT::const_iterator iShift = m_shiftsMap.begin();
for (TargetsIterT iTarget = boost::begin(rngTargets), iTargetEnd = boost::end(rngTargets); iTarget != iTargetEnd; ++iTarget, ++iShift) {
iShift->first = SubCharRangeT(Aux::GetReversedIf< !t_Forward >(*iTarget));
}
}
int operator ()(int nTarget, int nPos) const
{
SubCharRange2ShiftVecT & rng2shifts = m_shiftsMap[nTarget];
IntVecT & shifts = rng2shifts.second;
if (! shifts.empty())
return shifts[nPos];
SubCharRangeT const & rngText = rng2shifts.first;
shifts.resize( rngText.size() );
shifts[0] = 0;
size_t const subLen = shifts.size();
//std::cout << "{0";
for (int k = 0, i = 1; i < (int)subLen; ++i) {
while (0 < k && rngText[k] != rngText[i])
k = shifts[k - 1];
if (rngText[k] == rngText[i])
++k;
shifts[i] = k;
//std::cout << "," << k;
}
//std::cout << "}-\"" << rngText << "\"\n";
return shifts[nPos];
}
SubCharRangeT const & GetTargetRange(int const nTarget)
{
return m_shiftsMap[nTarget].first;
}
private:
ShiftsMapT mutable m_shiftsMap;
};
//////////////////////////////////////////////////////////////////////////
/*
Ищет несколько подстрок в заданном тексте
t_Forward - направление поиска: "от начала до конца текста" или наоборот
t_Greedy - жадность поиска
CharsEqualCmpT - например CharsEqual
CharRangeInT - RA-диапазон по символам
TargetsRangeT - Bidirectional-диапазон по искомым подстрокам
P.S. Вообще, поиск подстрок в тексте кроме чувствительности к регистру и "жадности" (поиск самых длинных совпадений)
должен параметризоваться и направлением поиска, что связано с "жадностью": если будем искать самые длинные, то при равенстве длин
у перекрывающихся подстрок ("макросов") отбрасывать нужно левый или правый в зависимости от направления поиска.
*/
template<
bool t_Forward
, bool t_Greedy
, class CharsEqualCmpT
, class CharRangeInT
, class TargetsRangeT
, class FindsConsumerT
>
inline size_t KmpFindManyImpl(
CharsEqualCmpT const & equal
, CharRangeInT const & rngTextIn
, TargetsRangeT const & rngTargets
, FindsConsumerT const & onFound)
{
typedef Aux::ReversedRangeIf< !t_Forward, CharRangeInT const >::ResultT CharRangeT;
typedef typename CharRangeT::const_iterator CharIterT;
typedef typename std::iterator_traits< CharIterT > CharIterTraitsT;
BOOST_STATIC_ASSERT( (boost::is_base_of< std::random_access_iterator_tag, typename CharIterTraitsT::iterator_category >::value) );
typedef typename CharIterTraitsT::value_type CharT;
typedef typename CharIterTraitsT::difference_type DiffT;
if (boost::empty(rngTextIn) || boost::empty(rngTargets))
return 0;
typedef KmpTableProducer< TargetsRangeT, t_Forward > ShiftsProducerT;
typedef typename ShiftsProducerT::SubCharRangeT SubCharRangeT;
/*typedef typename SubCharRangeT::const_iterator SubCharIterT;
BOOST_STATIC_ASSERT( (boost::is_same< CharT, typename SubCharIterT::value_type >::value) );*/
ShiftsProducerT shiftsGen(rngTargets);
CharRangeT const & rngText = Aux::GetReversedIf< !t_Forward >(rngTextIn);
DiffT const cChars = rngText.size();
CharIterT const iTextStart = rngText.begin();
CharIterT const iTextEnd = rngText.end();
CharIterT iCur = iTextStart;
DiffT const cTargets = boost::size(rngTargets);
typedef Aux::MatchState MatchStateT;
typedef Aux::MatchCountersT MatchCountersT;
typedef Aux::SearchGreedinessImpl< DiffT, t_Greedy > GreedinessT;
MatchCountersT matchCounters(cTargets, MatchStateT());
size_t cFinds = 0;
int cTooLongTargets = 0;
bool needContinue = true;
while (needContinue) {
DiffT const nTextPos = iCur - iTextStart;
for (int i = 0; i < cTargets; ++i) {
MatchStateT & match = matchCounters[i];
if (match.skip)
continue;
// Если макрос длиннее текста, в котором ищем, или остаток макроса длиннее остатка текста,
// то пропускаем и увеличиваем счетчик слишком длинных (при большом количестве длинных подстрок в конце ускоримся)
//SubCharRangeT const & target = Aux::GetReversedIf< !t_Forward >( rngTargets[i] );
SubCharRangeT const & target = shiftsGen.GetTargetRange(i);
DiffT const cTargetChars = target.size();
DiffT const cTargetTail = cTargetChars - match.count - 1;
if (cChars < cTargetChars || cChars - nTextPos < cTargetTail) {
match.skip = true;
++cTooLongTargets;
// Если все макросы оказались длиннее текста, то выходим из внутреннего, а потом и из внешнего циклов
if (cTargets == cTooLongTargets) {
needContinue = false;
break;
}
continue;
}
DiffT rest = match.count;
while (0 < rest && !equal(*iCur, target[match.count])) {
rest = shiftsGen(i, match.count - 1);
match.count = rest;
}
if (equal(*iCur, target[match.count]))
++match.count;
if (match.count == cTargetChars) {
DiffT const len = match.count;
++cFinds;
needContinue = onFound(nTextPos - len + 1, len, i);
// Выходим из внутреннего цикла, если поиск нежадный
if (GreedinessT::ResetCounters(matchCounters, cTargets, i))
break;
}
}
if (iTextEnd <= iCur++)
break;
}
return cFinds;
}
} //namespace Utils {
//////////////////////////////////////////////////////////////////////////
typedef char Char;
typedef std::basic_string< Char > String;
Utils::CharsEqual< Char, false > const g_CmpChar = Utils::CharsEqual< Char, false >();
//////////////////////////////////////////////////////////////////////////
struct SearchBase
{
struct Result
{
Result(int p = 0, int l = 0) : pos(p), len(l) {}
bool operator <(Result const & b) const { return pos < b.pos || (pos == b.pos && b.len < len); }
bool operator >(Result const & b) const { return b.pos < pos || (pos == b.pos && b.len < len); }
bool Intersect(Result const & b) const { return (pos <= b.pos && b.pos < pos + len) || (b.pos <= pos && pos < b.pos + b.len); }
int const pos, len;
};
template< bool t_Forward >
struct ResultsOrder
{
typedef std::greater< Result > ByPosition;
struct ByLength
{
bool operator ()(Result const & a, Result const & b) const
{
return b.len < a.len || (a.len == b.len && b.pos < a.pos);
}
};
};
};
template<>
struct SearchBase::ResultsOrder< true >
{
typedef std::less< Result > ByPosition;
struct ByLength
{
bool operator ()(Result const & a, Result const & b) const
{
return b.len < a.len || (a.len == b.len && a.pos < b.pos);
}
};
};
template< bool t_SearchFwd = true, bool t_ResultsFwd = true, bool t_Greedy = false >
struct Search : public SearchBase
{
static bool const SearchFwd = t_SearchFwd;
static bool const ResultsFwd = t_ResultsFwd;
typedef Search< SearchFwd, ResultsFwd, t_Greedy > ThisT;
typedef ResultsOrder< ResultsFwd > ResultsOrderT;
typedef std::set< Result, typename ResultsOrderT::ByPosition > ResultsPosSet;
typedef std::set< Result, typename ResultsOrderT::ByLength > ResultsLenSet;
typedef typename ResultsLenSet::const_iterator ResultsLenIter;
template< class CharRangeT, class TargetsRangeT >
static std::auto_ptr< ThisT > FindDistinct(CharRangeT const & rngText, TargetsRangeT const & rngTargets)
{
std::auto_ptr< ThisT > spSearch = FindAll(rngText, rngTargets);
FilterResults(*spSearch);
return spSearch;
}
template< class CharRangeT, class TargetsRangeT >
static std::auto_ptr< ThisT > FindAll(CharRangeT const & rngText, TargetsRangeT const & rngTargets)
{
std::auto_ptr< ThisT > spSearch( new ThisT() );
Utils::KmpFindManyImpl< t_SearchFwd, t_Greedy >(g_CmpChar, rngText, rngTargets, *spSearch);
return spSearch;
}
bool operator ()(int pos, int len, int /*nTarget*/) const
{
Result const match(pos, len);
m_results.insert(match);
m_longest.insert(match);
return true;
}
ResultsPosSet const & Results() const { return m_results; }
private:
Search() {}
template< class IterT >
static IterT GetNextRes(IterT const iRes)
{
IterT iNext = iRes;
return ++iNext;
}
static void FilterResults(ThisT & search)
{
ResultsLenIter const iEnd = search.m_longest.end();
for (ResultsLenIter iRes = search.m_longest.begin(); iRes != iEnd; ++iRes) {
for (ResultsLenIter iNext = GetNextRes(iRes); iNext != iEnd;) {
if (iRes->Intersect(*iNext)) {
search.m_results.erase(*iNext);
search.m_longest.erase(*iNext++);
}
else {
++iNext;
}
}
}
}
private:
ResultsPosSet mutable m_results;
ResultsLenSet mutable m_longest;
};
//////////////////////////////////////////////////////////////////////////
template< class SearchT, class CharRangeInT, class TargetsRangeT >
void TestPrint(CharRangeInT const & rngTextIn, TargetsRangeT const & rngTargets)
{
std::auto_ptr< SearchT > const search = SearchT::FindDistinct(rngTextIn, rngTargets);
//std::auto_ptr< SearchT > const search = SearchT::FindAll(rngTextIn, rngTargets);
enum { IsReversed = !SearchT::SearchFwd };
typedef Utils::Aux::ReversedRangeIf< IsReversed, CharRangeInT const >::ResultT CharRangeT;
CharRangeT const & rngText = Utils::Aux::GetReversedIf< !SearchT::SearchFwd >(rngTextIn);
::printf("%s:%u\n", IsReversed ? "Reverse" : "Forward", search->Results().size());
std::cout << "[" << rngText << "]:\n";
BOOST_FOREACH(SearchBase::Result const & res, search->Results()) {
typename CharRangeT::const_iterator const iSub = rngText.begin() + res.pos;
//::printf("\t{%d-%d}'%s'\n", (1 - IsReversed)*res.pos + IsReversed*(rngText.size() - res.pos - res.len), res.len, String(iSub, iSub + res.len).c_str());
::printf("\t{%d-%d}'%s'\n", res.pos, res.len, String(iSub, iSub + res.len).c_str());
}
}
//////////////////////////////////////////////////////////////////////////
int main(int const argc, char * const argv[])
{
if (argc < 3) {
printf("Search substrings in file: <program> <filepath> <str1>[, <str2>,...]");
return EXIT_FAILURE;
}
using namespace std;
try {
// Текст
ifstream ifs(argv[1], ios::in | ios::binary);
if (! ifs.is_open())
throw std::runtime_error("FindManySample.in not found!");
size_t const size = ifs.seekg(0, ios::end).tellg();
ifs.seekg(0, ios::beg);
std::vector< char > text(size + 1, 0);
ifs.read(&text[0], size);
Char const * const pText = &text[0];
int const cChars = size;
// Набор подстрок для поиска
typedef std::vector< String > TargetsT;
TargetsT targets;
{
std::set< String > targetsRaw;
for (int i = 2; i < argc; ++i)
targetsRaw.insert( String(argv[i]) );
targets.reserve( targetsRaw.size() );
BOOST_FOREACH( String const & sub, targetsRaw ) { targets.push_back(sub); }
}
// Текст, в котором производится поиск
typedef boost::iterator_range< char const * > CharRangeT;
CharRangeT const rngText = boost::make_iterator_range(pText, pText + cChars);
// Поиск "вперёд"
TestPrint< Search< true, true, true > >(rngText, targets);
// Поиск "назад"
TestPrint< Search< false, true, true > >(rngText, targets);
return EXIT_SUCCESS;
}
catch (exception & e) {
printf("Error: %s\n", e.what());
}
catch (...) {
printf("Unknown error\n");
}
return EXIT_FAILURE;
}
|