From 068956f7290189f66da155ff928c5af7aa6d585e Mon Sep 17 00:00:00 2001 From: Mitch Capper Date: Wed, 27 Jul 2022 01:48:47 -0700 Subject: [PATCH 1/3] Correction de l'emplacement du meta avec l'encodage : Dans le head fix conflict --- src/htsparse.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/htsparse.c b/src/htsparse.c index 63ed4105..360bbe1a 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -386,6 +386,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // int emited_footer = 0; // emitted footer comment tag(s) count + int emited_footer_todo = 0; // Flag pour mise à jour différée // int parent_relative = 0; // the parent is the base path (.js, .css..) @@ -685,15 +686,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { case 0: // We are looking for the first head so that we can declare the HTTP-headers charset early // Emit as soon as we see the first , , or tag. - // FIXME: we currently emit the tag BEFORE the tag, actually, which is not clean - if ((p = strfield(html, "")) != 0 - || ((p = strfield(html, "")) != 0 + if ((p = strfield(html, "")) != 0 || ((p = strfield(html, "")) != 0 + || ((p = strfield(html, " Date: Thu, 26 May 2022 20:24:21 +0200 Subject: [PATCH 2/3] =?UTF-8?q?Supprime=20les=20meta=20charset=20en=20doub?= =?UTF-8?q?le=20du=20fait=20du=20for=C3=A7age=20par=20httrack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/htsparse.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/htsparse.c b/src/htsparse.c index 360bbe1a..dbd5ee69 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -662,6 +662,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { /* Meta ? */ if (check_tag(intag_start, "meta")) { int pos; + int please_skip_tag = 0; // if ((pos = rech_tageq_all(html, "http-equiv"))) { @@ -673,11 +674,43 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { intag_ctype = 1; //NOPE-we do not convert the whole page actually //intag_start[1] = 'X'; + if ((emited_footer > 0) || (emited_footer_todo > 0)) { + // Skip this tag that is redundant + please_skip_tag = 1; + } } else if (strfield(token, "refresh")) { intag_ctype = 2; } } } + else if ((pos = rech_tageq_all(html, "charset"))) { + if ((emited_footer > 0) || (emited_footer_todo > 0)) { + // Skip this tag that is redundant + please_skip_tag = 1; + } + } + + if (please_skip_tag == 1) { + if (html - r->adr < r->size) { + /* Not on a starting tag yet */ + const char *adr_next = html + 1; + + while(*adr_next != '<' && (adr_next - r->adr) < r->size) { + adr_next++; + } + /* Jump to near end (index hack) */ + if (!adr_next || *adr_next != '<') { + if (html - r->adr < r->size - 4 + && r->size > 4 + ) { + html = r->adr + r->size - 2; + } + } else { + html = adr_next; + } + } + lastsaved = html; + } } if (opt->getmode & 1) { // sauver html From 2fb46165cacb2e185a5c282fa8d42b3efd2802da Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Sat, 28 May 2022 19:56:27 +0200 Subject: [PATCH 3/3] Evite d'avoir le tag base vide --- src/htsparse.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/htsparse.c b/src/htsparse.c index dbd5ee69..925ca169 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -388,6 +388,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { int emited_footer = 0; // emitted footer comment tag(s) count int emited_footer_todo = 0; // Flag pour mise à jour différée + int skip_until_end_of_tag = 0; // Skip until the > of the end of tag ? + // int parent_relative = 0; // the parent is the base path (.js, .css..) @@ -713,6 +715,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } + if (check_tag(intag_start, "base")) { + // Base tag will be empty so don't write it + html += 5; + lastsaved = html; + } + if (opt->getmode & 1) { // sauver html p = 0; switch (emited_footer) { @@ -831,6 +839,13 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } } + + if (skip_until_end_of_tag == 1) { + html++; + lastsaved = html; + skip_until_end_of_tag = 0; + } + } else { /* end of comment? */ // vérifier fermeture correcte if ((*(html - 1) == '-') && (*(html - 2) == '-')) { @@ -2577,6 +2592,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // écrire lien if ((p_type == 2) || (p_type == -2)) { // base href ou codebase, sauter lastsaved = eadr - 1 + 1; // sauter " + if (check_tag(intag_start, "base")) { + // Skip until the end of the tag + skip_until_end_of_tag = 1; + } } /* */ else if (opt->urlmode == 0) { // URL absolue dans tous les cas