From fd352fbc63ffc3dafa6d201c1e7dce9d8a5e9837 Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Thu, 26 May 2022 19:02:56 +0200 Subject: [PATCH 1/5] Correction de l'emplacement du meta avec l'encodage : Dans le head --- src/htsparse.c | 16 ++++++++++++---- src/htsserver.c | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/htsparse.c b/src/htsparse.c index 3da51583..24eaf618 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -386,6 +386,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // int emited_footer = 0; // emitted footer comment tag(s) count + int emited_footer_todo = 0; // Flag pour mise à jour différée // int parent_relative = 0; // the parent is the base path (.js, .css..) @@ -685,15 +686,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { case 0: // We are looking for the first head so that we can declare the HTTP-headers charset early // Emit as soon as we see the first , , or tag. - // FIXME: we currently emit the tag BEFORE the tag, actually, which is not clean - if ((p = strfield(html, "")) != 0 - || ((p = strfield(html, "")) != 0 + if ((p = strfield(html, "")) != 0 || ((p = strfield(html, "")) != 0 + || ((p = strfield(html, ""}, + ""}, {"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"}, {NULL, NULL} }; From 43646bad53d432ebdba0f8f28e5727827c7374fb Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Thu, 26 May 2022 20:24:21 +0200 Subject: [PATCH 2/5] =?UTF-8?q?Supprime=20les=20meta=20charset=20en=20doub?= =?UTF-8?q?le=20du=20fait=20du=20for=C3=A7age=20par=20httrack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/htsparse.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/htsparse.c b/src/htsparse.c index 24eaf618..6dcbf360 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -662,6 +662,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { /* Meta ? */ if (check_tag(intag_start, "meta")) { int pos; + int please_skip_tag = 0; // if ((pos = rech_tageq_all(html, "http-equiv"))) { @@ -673,11 +674,43 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { intag_ctype = 1; //NOPE-we do not convert the whole page actually //intag_start[1] = 'X'; + if ((emited_footer > 0) || (emited_footer_todo > 0)) { + // Skip this tag that is redundant + please_skip_tag = 1; + } } else if (strfield(token, "refresh")) { intag_ctype = 2; } } } + else if ((pos = rech_tageq_all(html, "charset"))) { + if ((emited_footer > 0) || (emited_footer_todo > 0)) { + // Skip this tag that is redundant + please_skip_tag = 1; + } + } + + if (please_skip_tag == 1) { + if (html - r->adr < r->size) { + /* Not on a starting tag yet */ + const char *adr_next = html + 1; + + while(*adr_next != '<' && (adr_next - r->adr) < r->size) { + adr_next++; + } + /* Jump to near end (index hack) */ + if (!adr_next || *adr_next != '<') { + if (html - r->adr < r->size - 4 + && r->size > 4 + ) { + html = r->adr + r->size - 2; + } + } else { + html = adr_next; + } + } + lastsaved = html; + } } if (opt->getmode & 1) { // sauver html From 2d833f8c3b7ed2acda52bedf4350535c1bf255ab Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Sat, 28 May 2022 19:56:27 +0200 Subject: [PATCH 3/5] Evite d'avoir le tag base vide --- src/htsparse.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/htsparse.c b/src/htsparse.c index 6dcbf360..128e83d1 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -388,6 +388,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { int emited_footer = 0; // emitted footer comment tag(s) count int emited_footer_todo = 0; // Flag pour mise à jour différée + int skip_until_end_of_tag = 0; // Skip until the > of the end of tag ? + // int parent_relative = 0; // the parent is the base path (.js, .css..) @@ -713,6 +715,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } + if (check_tag(intag_start, "base")) { + // Base tag will be empty so don't write it + html += 5; + lastsaved = html; + } + if (opt->getmode & 1) { // sauver html p = 0; switch (emited_footer) { @@ -831,6 +839,13 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } } + + if (skip_until_end_of_tag == 1) { + html++; + lastsaved = html; + skip_until_end_of_tag = 0; + } + } else { /* end of comment? */ // vérifier fermeture correcte if ((*(html - 1) == '-') && (*(html - 2) == '-')) { @@ -2577,6 +2592,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // écrire lien if ((p_type == 2) || (p_type == -2)) { // base href ou codebase, sauter lastsaved = eadr - 1 + 1; // sauter " + if (check_tag(intag_start, "base")) { + // Skip until the end of the tag + skip_until_end_of_tag = 1; + } } /* */ else if (opt->urlmode == 0) { // URL absolue dans tous les cas From e6a59fce4c8bba8d8c4fb2fa6eb36944e98d2d0f Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Sun, 28 Jan 2024 17:38:53 +0100 Subject: [PATCH 4/5] =?UTF-8?q?Correction=20si=20meta=20tag=20derri=C3=A8r?= =?UTF-8?q?e=20le=20content=20type.=20Issue=20https://github.com/mitchcapp?= =?UTF-8?q?er/httrack/issues/26?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/htsparse.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/htsparse.c b/src/htsparse.c index 128e83d1..22b1a0c1 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -673,13 +673,15 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { if (len > 0) { if (strfield(token, "content-type")) { - intag_ctype = 1; - //NOPE-we do not convert the whole page actually - //intag_start[1] = 'X'; if ((emited_footer > 0) || (emited_footer_todo > 0)) { // Skip this tag that is redundant please_skip_tag = 1; } + else { + intag_ctype = 1; + //NOPE-we do not convert the whole page actually + //intag_start[1] = 'X'; + } } else if (strfield(token, "refresh")) { intag_ctype = 2; } From 62399ca3d951362724e2cac3c1bd65cf943c9f5d Mon Sep 17 00:00:00 2001 From: Emmanuel Guyot Date: Sun, 28 Jan 2024 18:14:50 +0100 Subject: [PATCH 5/5] =?UTF-8?q?R=C3=A9tablissement=20commentaire=20par=20d?= =?UTF-8?q?=C3=A9faut=20pour=20merge=20request?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/htsserver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/htsserver.c b/src/htsserver.c index d3893abe..4e3babc4 100644 --- a/src/htsserver.c +++ b/src/htsserver.c @@ -359,7 +359,7 @@ int smallserver(T_SOC soc, char *url, char *method, char *data, char *path) { initStrElt initStr[] = { {"user", "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"}, {"footer", - ""}, + ""}, {"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"}, {NULL, NULL} };