diff options
author | David Kalnischkies <david@kalnischkies.de> | 2014-05-10 11:24:44 +0200 |
---|---|---|
committer | David Kalnischkies <david@kalnischkies.de> | 2014-05-10 11:24:44 +0200 |
commit | 8710a36a01c0cb1648926792c2ad05185535558e (patch) | |
tree | 0f4bc04b87ae5926d9f855f7268ee84802232749 | |
parent | ffe3c68e494efc1c2c4d748fa9d69e150867e5c2 (diff) |
improve pkgTagSection scanning and parsing
Removes the 256 fields limit, deals consistently with spaces littered
all over the place and is even a tiny bit faster than before.
Even comes with a bunch of new tests to validate these claims.
-rw-r--r-- | apt-pkg/tagfile.cc | 209 | ||||
-rw-r--r-- | apt-pkg/tagfile.h | 63 | ||||
-rw-r--r-- | cmdline/apt-extracttemplates.cc | 13 | ||||
-rwxr-xr-x | test/integration/test-apt-ftparchive-src-cachedb | 5 | ||||
-rw-r--r-- | test/libapt/tagfile_test.cc | 179 |
5 files changed, 365 insertions, 104 deletions
diff --git a/apt-pkg/tagfile.cc b/apt-pkg/tagfile.cc index 009ed7d74..52f4da2d5 100644 --- a/apt-pkg/tagfile.cc +++ b/apt-pkg/tagfile.cc @@ -47,6 +47,22 @@ public: unsigned long long Size; }; +static unsigned long AlphaHash(const char *Text, size_t Length) /*{{{*/ +{ + /* This very simple hash function for the last 8 letters gives + very good performance on the debian package files */ + if (Length > 8) + { + Text += (Length - 8); + Length = 8; + } + unsigned long Res = 0; + for (size_t i = 0; i < Length; ++i) + Res = ((unsigned long)(Text[i]) & 0xDF) ^ (Res << 1); + return Res & 0xFF; +} + /*}}}*/ + // TagFile::pkgTagFile - Constructor /*{{{*/ // --------------------------------------------------------------------- /* */ @@ -139,18 +155,23 @@ bool pkgTagFile::Resize(unsigned long long const newSize) */ bool pkgTagFile::Step(pkgTagSection &Tag) { - while (Tag.Scan(d->Start,d->End - d->Start) == false) + if(Tag.Scan(d->Start,d->End - d->Start) == false) { - if (Fill() == false) - return false; - - if(Tag.Scan(d->Start,d->End - d->Start)) - break; + do + { + if (Fill() == false) + return false; + + if(Tag.Scan(d->Start,d->End - d->Start, false)) + break; + + if (Resize() == false) + return _error->Error(_("Unable to parse package file %s (1)"), + d->Fd.Name().c_str()); - if (Resize() == false) - return _error->Error(_("Unable to parse package file %s (1)"), - d->Fd.Name().c_str()); + } while (Tag.Scan(d->Start,d->End - d->Start, false) == false); } + d->Start += Tag.size(); d->iOffset += Tag.size(); @@ -244,7 +265,7 @@ bool pkgTagFile::Jump(pkgTagSection &Tag,unsigned long long Offset) if (Fill() == false) return false; - if (Tag.Scan(d->Start, d->End - d->Start) == false) + if (Tag.Scan(d->Start, d->End - d->Start, false) == false) return _error->Error(_("Unable to parse package file %s (2)"),d->Fd.Name().c_str()); return true; @@ -254,27 +275,46 @@ bool pkgTagFile::Jump(pkgTagSection &Tag,unsigned long long Offset) // --------------------------------------------------------------------- /* */ pkgTagSection::pkgTagSection() - : Section(0), TagCount(0), d(NULL), Stop(0) + : Section(0), d(NULL), Stop(0) { - memset(&Indexes, 0, sizeof(Indexes)); - memset(&AlphaIndexes, 0, sizeof(AlphaIndexes)); + memset(&LookupTable, 0, sizeof(LookupTable)); } /*}}}*/ // TagSection::Scan - Scan for the end of the header information /*{{{*/ -// --------------------------------------------------------------------- -/* This looks for the first double new line in the data stream. - It also indexes the tags in the section. */ -bool pkgTagSection::Scan(const char *Start,unsigned long MaxLength) +bool pkgTagSection::Scan(const char *Start,unsigned long MaxLength, bool const Restart) { + Section = Start; const char *End = Start + MaxLength; - Stop = Section = Start; - memset(AlphaIndexes,0,sizeof(AlphaIndexes)); + + if (Restart == false && Tags.empty() == false) + { + Stop = Section + Tags.back().StartTag; + if (End <= Stop) + return false; + Stop = (const char *)memchr(Stop,'\n',End - Stop); + if (Stop == NULL) + return false; + ++Stop; + } + else + { + Stop = Section; + if (Tags.empty() == false) + { + memset(&LookupTable, 0, sizeof(LookupTable)); + Tags.clear(); + } + Tags.reserve(0x100); + } + size_t TagCount = Tags.size(); if (Stop == 0) return false; - TagCount = 0; - while (TagCount+1 < sizeof(Indexes)/sizeof(Indexes[0]) && Stop < End) + TagData lastTagData(0); + lastTagData.EndTag = 0; + unsigned long lastTagHash = 0; + while (Stop < End) { TrimRecord(true,End); @@ -286,12 +326,39 @@ bool pkgTagSection::Scan(const char *Start,unsigned long MaxLength) // Start a new index and add it to the hash if (isspace(Stop[0]) == 0) { - Indexes[TagCount++] = Stop - Section; - AlphaIndexes[AlphaHash(Stop,End)] = TagCount; + // store the last found tag + if (lastTagData.EndTag != 0) + { + if (LookupTable[lastTagHash] != 0) + lastTagData.NextInBucket = LookupTable[lastTagHash]; + LookupTable[lastTagHash] = TagCount; + Tags.push_back(lastTagData); + } + + ++TagCount; + lastTagData = TagData(Stop - Section); + // find the colon separating tag and value + char const * Colon = (char const *) memchr(Stop, ':', End - Stop); + if (Colon == NULL) + return false; + // find the end of the tag (which might or might not be the colon) + char const * EndTag = Colon; + --EndTag; + for (; EndTag > Stop && isspace(*EndTag) != 0; --EndTag) + ; + ++EndTag; + lastTagData.EndTag = EndTag - Section; + lastTagHash = AlphaHash(Stop, EndTag - Stop); + // find the beginning of the value + Stop = Colon + 1; + for (; isspace(*Stop) != 0; ++Stop); + if (Stop >= End) + return false; + lastTagData.StartValue = Stop - Section; } Stop = (const char *)memchr(Stop,'\n',End - Stop); - + if (Stop == 0) return false; @@ -302,7 +369,16 @@ bool pkgTagSection::Scan(const char *Start,unsigned long MaxLength) // Double newline marks the end of the record if (Stop+1 < End && Stop[1] == '\n') { - Indexes[TagCount] = Stop - Section; + if (lastTagData.EndTag != 0) + { + if (LookupTable[lastTagHash] != 0) + lastTagData.NextInBucket = LookupTable[lastTagHash]; + LookupTable[lastTagHash] = TagCount; + Tags.push_back(lastTagData); + } + + TagData const td(Stop - Section); + Tags.push_back(td); TrimRecord(false,End); return true; } @@ -331,8 +407,8 @@ void pkgTagSection::Trim() for (; Stop > Section + 2 && (Stop[-2] == '\n' || Stop[-2] == '\r'); Stop--); } /*}}}*/ -// TagSection::Exists - return True if a tag exists /*{{{*/ -bool pkgTagSection::Exists(const char* const Tag) +// TagSection::Exists - return True if a tag exists /*{{{*/ +bool pkgTagSection::Exists(const char* const Tag) const { unsigned int tmp; return Find(Tag, tmp); @@ -343,73 +419,43 @@ bool pkgTagSection::Exists(const char* const Tag) /* This searches the section for a tag that matches the given string. */ bool pkgTagSection::Find(const char *Tag,unsigned int &Pos) const { - unsigned int Length = strlen(Tag); - unsigned int I = AlphaIndexes[AlphaHash(Tag)]; - if (I == 0) + size_t const Length = strlen(Tag); + unsigned int Bucket = LookupTable[AlphaHash(Tag, Length)]; + if (Bucket == 0) return false; - I--; - - for (unsigned int Counter = 0; Counter != TagCount; Counter++, - I = (I+1)%TagCount) + + for (; Bucket != 0; Bucket = Tags[Bucket - 1].NextInBucket) { - const char *St; - St = Section + Indexes[I]; - if (strncasecmp(Tag,St,Length) != 0) + if ((Tags[Bucket - 1].EndTag - Tags[Bucket - 1].StartTag) != Length) continue; - // Make sure the colon is in the right place - const char *C = St + Length; - for (; isspace(*C) != 0; C++); - if (*C != ':') + char const * const St = Section + Tags[Bucket - 1].StartTag; + if (strncasecmp(Tag,St,Length) != 0) continue; - Pos = I; + + Pos = Bucket - 1; return true; } Pos = 0; return false; } - /*}}}*/ -// TagSection::Find - Locate a tag /*{{{*/ -// --------------------------------------------------------------------- -/* This searches the section for a tag that matches the given string. */ bool pkgTagSection::Find(const char *Tag,const char *&Start, const char *&End) const { - unsigned int Length = strlen(Tag); - unsigned int I = AlphaIndexes[AlphaHash(Tag)]; - if (I == 0) + unsigned int Pos; + if (Find(Tag, Pos) == false) return false; - I--; - - for (unsigned int Counter = 0; Counter != TagCount; Counter++, - I = (I+1)%TagCount) - { - const char *St; - St = Section + Indexes[I]; - if (strncasecmp(Tag,St,Length) != 0) - continue; - - // Make sure the colon is in the right place - const char *C = St + Length; - for (; isspace(*C) != 0; C++); - if (*C != ':') - continue; - // Strip off the gunk from the start end - Start = C; - End = Section + Indexes[I+1]; - if (Start >= End) - return _error->Error("Internal parsing error"); - - for (; (isspace(*Start) != 0 || *Start == ':') && Start < End; Start++); - for (; isspace(End[-1]) != 0 && End > Start; End--); - - return true; - } - - Start = End = 0; - return false; + Start = Section + Tags[Pos].StartValue; + // Strip off the gunk from the end + End = Section + Tags[Pos + 1].StartTag; + if (unlikely(Start > End)) + return _error->Error("Internal parsing error"); + + for (; isspace(End[-1]) != 0 && End > Start; --End); + + return true; } /*}}}*/ // TagSection::FindS - Find a string /*{{{*/ @@ -504,6 +550,13 @@ bool pkgTagSection::FindFlag(unsigned long &Flags, unsigned long Flag, return true; } /*}}}*/ +APT_PURE unsigned int pkgTagSection::Count() const { /*{{{*/ + if (Tags.empty() == true) + return 0; + // the last element is just marking the end and isn't a real one + return Tags.size() - 1; +} + /*}}}*/ // TFRewrite - Rewrite a control record /*{{{*/ // --------------------------------------------------------------------- /* This writes the control record to stdout rewriting it as necessary. The diff --git a/apt-pkg/tagfile.h b/apt-pkg/tagfile.h index d1a24ba45..b0cfab759 100644 --- a/apt-pkg/tagfile.h +++ b/apt-pkg/tagfile.h @@ -25,6 +25,8 @@ #include <stdio.h> #include <string> +#include <vector> +#include <list> #ifndef APT_8_CLEANER_HEADERS #include <apt-pkg/fileutl.h> @@ -35,23 +37,20 @@ class FileFd; class pkgTagSection { const char *Section; - // We have a limit of 256 tags per section. - unsigned int Indexes[256]; - unsigned int AlphaIndexes[0x100]; - unsigned int TagCount; + struct TagData { + unsigned int StartTag; + unsigned int EndTag; + unsigned int StartValue; + unsigned int NextInBucket; + + TagData(unsigned int const StartTag) : StartTag(StartTag), NextInBucket(0) {} + }; + std::vector<TagData> Tags; + unsigned int LookupTable[0x100]; + // dpointer placeholder (for later in case we need it) void *d; - /* This very simple hash function for the last 8 letters gives - very good performance on the debian package files */ - inline static unsigned long AlphaHash(const char *Text, const char *End = 0) - { - unsigned long Res = 0; - for (; Text != End && *Text != ':' && *Text != 0; Text++) - Res = ((unsigned long)(*Text) & 0xDF) ^ (Res << 1); - return Res & 0xFF; - } - protected: const char *Stop; @@ -69,17 +68,39 @@ class pkgTagSection unsigned long Flag) const; bool static FindFlag(unsigned long &Flags, unsigned long Flag, const char* Start, const char* Stop); - bool Scan(const char *Start,unsigned long MaxLength); + + /** \brief searches the boundaries of the current section + * + * While parameter Start marks the beginning of the section, this method + * will search for the first double newline in the data stream which marks + * the end of the section. It also does a first pass over the content of + * the section parsing it as encountered for processing later on by Find + * + * @param Start is the beginning of the section + * @param MaxLength is the size of valid data in the stream pointed to by Start + * @param Restart if enabled internal state will be cleared, otherwise it is + * assumed that now more data is available in the stream and the parsing will + * start were it encountered insufficent data the last time. + * + * @return \b true if section end was found, \b false otherwise. + * Beware that internal state will be inconsistent if \b false is returned! + */ + APT_MUSTCHECK bool Scan(const char *Start, unsigned long MaxLength, bool const Restart = true); inline unsigned long size() const {return Stop - Section;}; void Trim(); virtual void TrimRecord(bool BeforeRecord, const char* &End); - - inline unsigned int Count() const {return TagCount;}; - bool Exists(const char* const Tag); - + + /** \brief amount of Tags in the current section + * + * Note: if a Tag is mentioned repeatly it will be counted multiple + * times, but only the last occurance is available via Find methods. + */ + unsigned int Count() const; + bool Exists(const char* const Tag) const; + inline void Get(const char *&Start,const char *&Stop,unsigned int I) const - {Start = Section + Indexes[I]; Stop = Section + Indexes[I+1];} - + {Start = Section + Tags[I].StartTag; Stop = Section + Tags[I+1].StartTag;} + inline void GetSection(const char *&Start,const char *&Stop) const { Start = Section; diff --git a/cmdline/apt-extracttemplates.cc b/cmdline/apt-extracttemplates.cc index e4428e051..7be59b9f8 100644 --- a/cmdline/apt-extracttemplates.cc +++ b/cmdline/apt-extracttemplates.cc @@ -103,10 +103,12 @@ bool DebFile::DoItem(Item &I, int &Fd) if (strcmp(I.Name, "control") == 0) { delete [] Control; - Control = new char[I.Size+1]; - Control[I.Size] = 0; + Control = new char[I.Size+3]; + Control[I.Size] = '\n'; + Control[I.Size + 1] = '\n'; + Control[I.Size + 2] = '\0'; Which = IsControl; - ControlLen = I.Size; + ControlLen = I.Size + 3; // make it call the Process method below. this is so evil Fd = -2; } @@ -162,9 +164,10 @@ bool DebFile::Process(Item &/*I*/, const unsigned char *data, bool DebFile::ParseInfo() { if (Control == NULL) return false; - + pkgTagSection Section; - Section.Scan(Control, ControlLen); + if (Section.Scan(Control, ControlLen) == false) + return false; Package = Section.FindS("Package"); Version = GetInstalledVer(Package); diff --git a/test/integration/test-apt-ftparchive-src-cachedb b/test/integration/test-apt-ftparchive-src-cachedb index 1af193632..c850c739f 100755 --- a/test/integration/test-apt-ftparchive-src-cachedb +++ b/test/integration/test-apt-ftparchive-src-cachedb @@ -177,6 +177,11 @@ assert_correct_sources_file mkdir aptarchive/pool/invalid printf "meep" > aptarchive/pool/invalid/invalid_1.0.dsc testequal " +E: Could not find a record in the DSC 'aptarchive/pool/invalid/invalid_1.0.dsc'" aptftparchive sources aptarchive/pool/invalid +rm -f aptarchive/pool/invalid/invalid_1.0.dsc + +printf "meep: yes" > aptarchive/pool/invalid/invalid_1.0.dsc +testequal " E: Could not find a Source entry in the DSC 'aptarchive/pool/invalid/invalid_1.0.dsc'" aptftparchive sources aptarchive/pool/invalid rm -f aptarchive/pool/invalid/invalid_1.0.dsc diff --git a/test/libapt/tagfile_test.cc b/test/libapt/tagfile_test.cc index 1bac75b55..df618ea16 100644 --- a/test/libapt/tagfile_test.cc +++ b/test/libapt/tagfile_test.cc @@ -7,6 +7,7 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <sstream> #include <gtest/gtest.h> @@ -34,3 +35,181 @@ TEST(TagFileTest,SingleField) // There is only one section in this tag file EXPECT_FALSE(tfile.Step(section)); } + +TEST(TagFileTest,MultipleSections) +{ + FileFd fd; + createTemporaryFile("bigsection", fd, NULL, "Package: pkgA\n" + "Version: 1\n" + "Size: 100\n" + "Description: aaa\n" + " aaa\n" + "\n" + "Package: pkgB\n" + "Version: 1\n" + "Flag: no\n" + "Description: bbb\n" + "\n" + "Package: pkgC\n" + "Version: 2\n" + "Flag: yes\n" + "Description:\n" + " ccc\n" + ); + + pkgTagFile tfile(&fd); + pkgTagSection section; + EXPECT_FALSE(section.Exists("Version")); + + EXPECT_TRUE(tfile.Step(section)); + EXPECT_EQ(4, section.Count()); + EXPECT_TRUE(section.Exists("Version")); + EXPECT_TRUE(section.Exists("Package")); + EXPECT_TRUE(section.Exists("Size")); + EXPECT_FALSE(section.Exists("Flag")); + EXPECT_TRUE(section.Exists("Description")); + EXPECT_EQ("pkgA", section.FindS("Package")); + EXPECT_EQ("1", section.FindS("Version")); + EXPECT_EQ(1, section.FindULL("Version")); + EXPECT_EQ(100, section.FindULL("Size")); + unsigned long Flags = 1; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(1, Flags); + Flags = 0; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(0, Flags); + EXPECT_EQ("aaa\n aaa", section.FindS("Description")); + + + EXPECT_TRUE(tfile.Step(section)); + EXPECT_EQ(4, section.Count()); + EXPECT_TRUE(section.Exists("Version")); + EXPECT_TRUE(section.Exists("Package")); + EXPECT_FALSE(section.Exists("Size")); + EXPECT_TRUE(section.Exists("Flag")); + EXPECT_TRUE(section.Exists("Description")); + EXPECT_EQ("pkgB", section.FindS("Package")); + EXPECT_EQ("1", section.FindS("Version")); + EXPECT_EQ(1, section.FindULL("Version")); + EXPECT_EQ(0, section.FindULL("Size")); + Flags = 1; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(0, Flags); + Flags = 0; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(0, Flags); + EXPECT_EQ("bbb", section.FindS("Description")); + + EXPECT_TRUE(tfile.Step(section)); + EXPECT_EQ(4, section.Count()); + EXPECT_TRUE(section.Exists("Version")); + EXPECT_TRUE(section.Exists("Package")); + EXPECT_FALSE(section.Exists("Size")); + EXPECT_TRUE(section.Exists("Flag")); + EXPECT_TRUE(section.Exists("Description")); + EXPECT_EQ("pkgC", section.FindS("Package")); + EXPECT_EQ("2", section.FindS("Version")); + EXPECT_EQ(2, section.FindULL("Version")); + Flags = 0; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(1, Flags); + Flags = 1; + EXPECT_TRUE(section.FindFlag("Flag", Flags, 1)); + EXPECT_EQ(1, Flags); + EXPECT_EQ("ccc", section.FindS("Description")); + + // There is no section left in this tag file + EXPECT_FALSE(tfile.Step(section)); +} + +TEST(TagFileTest,BigSection) +{ + size_t const count = 500; + std::stringstream content; + for (size_t i = 0; i < count; ++i) + content << "Field-" << i << ": " << (2000 + i) << std::endl; + + FileFd fd; + createTemporaryFile("bigsection", fd, NULL, content.str().c_str()); + + pkgTagFile tfile(&fd); + pkgTagSection section; + EXPECT_TRUE(tfile.Step(section)); + + EXPECT_EQ(count, section.Count()); + for (size_t i = 0; i < count; ++i) + { + std::stringstream name; + name << "Field-" << i; + EXPECT_TRUE(section.Exists(name.str().c_str())) << name.str() << " does not exist"; + EXPECT_EQ((2000 + i), section.FindULL(name.str().c_str())); + } + + // There is only one section in this tag file + EXPECT_FALSE(tfile.Step(section)); +} + +TEST(TagFileTest, PickedUpFromPreviousCall) +{ + size_t const count = 500; + std::stringstream contentstream; + for (size_t i = 0; i < count; ++i) + contentstream << "Field-" << i << ": " << (2000 + i) << std::endl; + contentstream << std::endl << std::endl; + std::string content = contentstream.str(); + + pkgTagSection section; + EXPECT_FALSE(section.Scan(content.c_str(), content.size()/2)); + EXPECT_NE(0, section.Count()); + EXPECT_NE(count, section.Count()); + EXPECT_TRUE(section.Scan(content.c_str(), content.size(), false)); + EXPECT_EQ(count, section.Count()); + + for (size_t i = 0; i < count; ++i) + { + std::stringstream name; + name << "Field-" << i; + EXPECT_TRUE(section.Exists(name.str().c_str())) << name.str() << " does not exist"; + EXPECT_EQ((2000 + i), section.FindULL(name.str().c_str())); + } +} + +TEST(TagFileTest, SpacesEverywhere) +{ + std::string content = + "Package: pkgA\n" + "Package: pkgB\n" + "NoSpaces:yes\n" + "TagSpaces\t :yes\n" + "ValueSpaces: \tyes\n" + "BothSpaces \t:\t yes\n" + "TrailingSpaces: yes\t \n" + "Naming Space: yes\n" + "Naming Spaces: yes\n" + "Package : pkgC \n" + "Multi-Colon::yes:\n" + "\n\n"; + + pkgTagSection section; + EXPECT_TRUE(section.Scan(content.c_str(), content.size())); + EXPECT_TRUE(section.Exists("Package")); + EXPECT_TRUE(section.Exists("NoSpaces")); + EXPECT_TRUE(section.Exists("TagSpaces")); + EXPECT_TRUE(section.Exists("ValueSpaces")); + EXPECT_TRUE(section.Exists("BothSpaces")); + EXPECT_TRUE(section.Exists("TrailingSpaces")); + EXPECT_TRUE(section.Exists("Naming Space")); + EXPECT_TRUE(section.Exists("Naming Spaces")); + EXPECT_TRUE(section.Exists("Multi-Colon")); + EXPECT_EQ("pkgC", section.FindS("Package")); + EXPECT_EQ("yes", section.FindS("NoSpaces")); + EXPECT_EQ("yes", section.FindS("TagSpaces")); + EXPECT_EQ("yes", section.FindS("ValueSpaces")); + EXPECT_EQ("yes", section.FindS("BothSpaces")); + EXPECT_EQ("yes", section.FindS("TrailingSpaces")); + EXPECT_EQ("yes", section.FindS("Naming Space")); + EXPECT_EQ("yes", section.FindS("Naming Spaces")); + EXPECT_EQ(":yes:", section.FindS("Multi-Colon")); + // overridden values are still present, but not really accessible + EXPECT_EQ(11, section.Count()); +} |