NetSurf
save_complete.c
Go to the documentation of this file.
1/*
2 * Copyright 2012 John-Mark Bell <jmb@netsurf-browser.org>
3 * Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net>
4 *
5 * This file is part of NetSurf, http://www.netsurf-browser.org/
6 *
7 * NetSurf is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 * NetSurf is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20/**
21 * \file
22 * Save HTML document with dependencies implementation.
23 */
24
25#include <assert.h>
26#include <errno.h>
27#include <stdio.h>
28#include <string.h>
29#include <strings.h>
30#include <sys/types.h>
31#include <dom/dom.h>
32
33#include "utils/config.h"
34#include "utils/regex.h"
35#include "utils/corestrings.h"
36#include "utils/log.h"
37#include "utils/nsurl.h"
38#include "utils/utf8.h"
39#include "utils/utils.h"
40#include "utils/file.h"
41#include "utils/messages.h"
42#include "utils/ascii.h"
43#include "netsurf/content.h"
44#include "content/hlcache.h"
45#include "css/css.h"
46#include "html/box.h"
47#include "html/html_save.h"
48#include "html/html.h"
49
50#include "netsurf/misc.h"
53
55
56/** An entry in save_complete_list. */
57typedef struct save_complete_entry {
59 struct save_complete_entry *next; /**< Next entry in list */
61
62typedef struct save_complete_ctx {
63 const char *path;
66
68 FILE *fp;
71
72typedef enum {
76
77
78static nserror save_complete_save_html(save_complete_ctx *ctx, struct hlcache_handle *c, bool index);
80 struct nscss_import *imports, uint32_t import_count);
81
82
84 const char *path, save_complete_set_type_cb set_type)
85{
86 ctx->path = path;
87 ctx->list = NULL;
88 ctx->set_type = set_type;
89}
90
92{
93 save_complete_entry *list = ctx->list;
94
95 while (list != NULL) {
96 save_complete_entry *next = list->next;
97 free(list);
98 list = next;
99 }
100}
101
102static nserror
104 struct hlcache_handle *content)
105{
106 save_complete_entry *entry;
107
108 entry = malloc(sizeof (*entry));
109 if (entry == NULL) {
110 return NSERROR_NOMEM;
111 }
112
113 entry->content = content;
114 entry->next = ctx->list;
115 ctx->list = entry;
116
117 return NSERROR_OK;
118}
119
120/**
121 * find handle to content for url
122 *
123 * \param ctx The save context
124 * \param url The url to find content handle for
125 * \return The content handle or NULL if not found.
126 */
127static struct hlcache_handle *
129{
131
132 for (entry = ctx->list; entry != NULL; entry = entry->next) {
133 if (nsurl_compare(url,
136 return entry->content;
137 }
138 }
139
140 return NULL;
141}
142
143
144static bool
146 struct hlcache_handle *content)
147{
149
150 for (entry = ctx->list; entry != NULL; entry = entry->next) {
153 return true;
154 }
155
156 return false;
157}
158
159static nserror
161 const char *leafname,
162 const uint8_t *data,
163 size_t data_len,
164 lwc_string *mime_type)
165{
166 nserror ret;
167 FILE *fp;
168 char *fname = NULL;
169
170 ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, leafname);
171 if (ret != NSERROR_OK) {
172 return ret;
173 }
174
175 fp = fopen(fname, "wb");
176 if (fp == NULL) {
177 free(fname);
178 NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
179 return NSERROR_SAVE_FAILED;
180 }
181
182 fwrite(data, sizeof(*data), data_len, fp);
183
184 fclose(fp);
185
186 if (ctx->set_type != NULL) {
187 ctx->set_type(fname, mime_type);
188 }
189 free(fname);
190
191 return NSERROR_OK;
192}
193
194
/**
 * Perform a POSIX regexec() on a string that has no NUL terminator.
 *
 * regexec() needs a terminated string, so the first \a stringlen bytes of
 * \a string are copied into a temporary, terminated buffer and the match
 * is run against that copy. Offsets reported in \a pmatch are therefore
 * relative to the start of \a string.
 *
 * \param preg       Compiled regular expression.
 * \param string     Data to match against; need not be NUL terminated.
 * \param stringlen  Number of bytes of \a string to consider.
 * \param nmatch     Number of entries available in \a pmatch.
 * \param pmatch     Filled in by regexec() with the match offsets.
 * \param eflags     Flags passed straight through to regexec().
 * \return the regexec() result, or -1 if the temporary buffer could not
 *         be allocated.
 */
static int
snregexec(regex_t *preg,
		const char *string,
		size_t stringlen,
		size_t nmatch,
		regmatch_t pmatch[],
		int eflags)
{
	char *strbuf;
	int matches;

	/* stringlen + 1 must not wrap round to a zero-sized allocation */
	if (stringlen == SIZE_MAX) {
		return -1;
	}

	strbuf = malloc(stringlen + 1);
	if (strbuf == NULL) {
		return -1;
	}

	/* memcpy() with a NULL source is undefined even for zero bytes */
	if (stringlen > 0) {
		memcpy(strbuf, string, stringlen);
	}
	strbuf[stringlen] = '\0';

	matches = regexec(preg, strbuf, nmatch, pmatch, eflags);

	free(strbuf);

	return matches;
}
221
222
223/**
224 * Rewrite stylesheet \@import rules for save complete.
225 *
226 * \param ctx Save complete context.
227 * \param source stylesheet source.
228 * \param size size of source.
229 * \param base url of stylesheet.
230 * \param osize updated with the size of the result.
231 * \return converted source, or NULL on out of memory.
232 */
233static uint8_t *
235 const uint8_t *source,
236 size_t size,
237 const nsurl *base,
238 size_t *osize)
239{
240 uint8_t *rewritten;
241 unsigned long offset = 0;
242 unsigned int imports = 0;
243 nserror error;
244
245 /* count number occurrences of @import to (over)estimate result size */
246 /* can't use strstr because source is not 0-terminated string */
247 for (offset = 0;
248 (SLEN("@import") < size) && (offset <= (size - SLEN("@import")));
249 offset++) {
250 if (source[offset] == '@' &&
251 ascii_to_lower(source[offset + 1]) == 'i' &&
252 ascii_to_lower(source[offset + 2]) == 'm' &&
253 ascii_to_lower(source[offset + 3]) == 'p' &&
254 ascii_to_lower(source[offset + 4]) == 'o' &&
255 ascii_to_lower(source[offset + 5]) == 'r' &&
256 ascii_to_lower(source[offset + 6]) == 't') {
257 imports++;
258 }
259 }
260
261 rewritten = malloc(size + imports * 20);
262 if (rewritten == NULL)
263 return NULL;
264 *osize = 0;
265
266 offset = 0;
267 while (offset < size) {
268 const uint8_t *import_url = NULL;
269 char *import_url_copy;
270 int import_url_len = 0;
271 nsurl *url = NULL;
272 regmatch_t match[11];
273 int m;
274
276 (const char *)source + offset,
277 size - offset,
278 11,
279 match,
280 0);
281 if (m)
282 break;
283
284 if (match[2].rm_so != -1) {
285 import_url = source + offset + match[2].rm_so;
286 import_url_len = match[2].rm_eo - match[2].rm_so;
287 } else if (match[4].rm_so != -1) {
288 import_url = source + offset + match[4].rm_so;
289 import_url_len = match[4].rm_eo - match[4].rm_so;
290 } else if (match[6].rm_so != -1) {
291 import_url = source + offset + match[6].rm_so;
292 import_url_len = match[6].rm_eo - match[6].rm_so;
293 } else if (match[8].rm_so != -1) {
294 import_url = source + offset + match[8].rm_so;
295 import_url_len = match[8].rm_eo - match[8].rm_so;
296 } else if (match[10].rm_so != -1) {
297 import_url = source + offset + match[10].rm_so;
298 import_url_len = match[10].rm_eo - match[10].rm_so;
299 }
300 assert(import_url != NULL);
301
302 import_url_copy = strndup((const char *)import_url,
303 import_url_len);
304 if (import_url_copy == NULL) {
305 free(rewritten);
306 return NULL;
307 }
308
309 error = nsurl_join(base, import_url_copy, &url);
310 free(import_url_copy);
311 if (error == NSERROR_NOMEM) {
312 free(rewritten);
313 return NULL;
314 }
315
316 /* copy data before match */
317 memcpy(rewritten + *osize, source + offset, match[0].rm_so);
318 *osize += match[0].rm_so;
319
320 if (url != NULL) {
323 if (content != NULL) {
324 /* replace import */
325 char buf[64];
326 snprintf(buf, sizeof buf, "@import '%p'",
327 content);
328 memcpy(rewritten + *osize, buf, strlen(buf));
329 *osize += strlen(buf);
330 } else {
331 /* copy import */
332 memcpy(rewritten + *osize,
333 source + offset + match[0].rm_so,
334 match[0].rm_eo - match[0].rm_so);
335 *osize += match[0].rm_eo - match[0].rm_so;
336 }
337 nsurl_unref(url);
338 } else {
339 /* copy import */
340 memcpy(rewritten + *osize,
341 source + offset + match[0].rm_so,
342 match[0].rm_eo - match[0].rm_so);
343 *osize += match[0].rm_eo - match[0].rm_so;
344 }
345
346 assert(0 < match[0].rm_eo);
347 offset += match[0].rm_eo;
348 }
349
350 /* copy rest of source */
351 if (offset < size) {
352 memcpy(rewritten + *osize, source + offset, size - offset);
353 *osize += size - offset;
354 }
355
356 return rewritten;
357}
358
359static nserror
361{
362 const uint8_t *css_data;
363 size_t css_size;
364 uint8_t *source;
365 size_t source_len;
366 struct nscss_import *imports;
367 uint32_t import_count;
368 lwc_string *type;
369 char filename[32];
371
373 hlcache_handle_get_url(css)) != NULL) {
374 return NSERROR_OK;
375 }
376
378 if (result != NSERROR_OK) {
379 return result;
380 }
381
382 imports = nscss_get_imports(css, &import_count);
384 imports,
385 import_count);
386 if (result != NSERROR_OK) {
387 return result;
388 }
389
390 css_data = content_get_source_data(css, &css_size);
392 ctx,
393 css_data,
394 css_size,
396 &source_len);
397 if (source == NULL) {
398 return NSERROR_NOMEM;
399 }
400
402 if (type == NULL) {
403 free(source);
404 return NSERROR_NOMEM;
405 }
406
407 snprintf(filename, sizeof filename, "%p", css);
408
409 result = save_complete_save_buffer(ctx, filename,
410 source, source_len, type);
411
412 lwc_string_unref(type);
413 free(source);
414
415 return result;
416}
417
418static nserror
420 struct nscss_import *imports,
421 uint32_t import_count)
422{
423 nserror res = NSERROR_OK;
424 uint32_t i;
425
426 for (i = 0; i < import_count; i++) {
427 /* treat a valid content as a stylesheet to save */
428 if (imports[i].c != NULL) {
429 res = save_complete_save_stylesheet(ctx, imports[i].c);
430 if (res != NSERROR_OK) {
431 return res;
432 }
433 }
434 }
435
436 return res;
437}
438
439static nserror
441 struct html_stylesheet *sheet)
442{
443 if (sheet->sheet == NULL) {
444 return NSERROR_OK;
445 }
446
447 return save_complete_save_stylesheet(ctx, sheet->sheet);
448}
449
450static nserror
453{
454 struct html_stylesheet *sheets;
455 unsigned int i, count;
456 nserror res;
457
458 sheets = html_get_stylesheets(c, &count);
459
460 for (i = STYLESHEET_START; i != count; i++) {
461 res = save_complete_save_html_stylesheet(ctx, &sheets[i]);
462 if (res != NSERROR_OK) {
463 return res;
464 }
465 }
466
467 return NSERROR_OK;
468}
469
470static nserror
472{
473 const uint8_t *obj_data;
474 size_t obj_size;
475 lwc_string *type;
477 char filename[32];
478
479 if (content_get_type(obj) == CONTENT_NONE) {
480 return NSERROR_OK;
481 }
482
483 obj_data = content_get_source_data(obj, &obj_size);
484 if (obj_data == NULL) {
485 return NSERROR_OK;
486 }
487
489 hlcache_handle_get_url(obj)) != NULL) {
490 return NSERROR_OK;
491 }
492
494 if (result != NSERROR_OK) {
495 return result;
496 }
497
498 if (content_get_type(obj) == CONTENT_HTML) {
499 return save_complete_save_html(ctx, obj, false);
500 }
501
502 snprintf(filename, sizeof filename, "%p", obj);
503
505 if (type == NULL) {
506 return NSERROR_NOMEM;
507 }
508
509 result = save_complete_save_buffer(ctx, filename, obj_data, obj_size, type);
510
511 lwc_string_unref(type);
512
513 return result;
514}
515
516static nserror
519{
520 struct content_html_object *object;
521 unsigned int count;
522 nserror res;
523
524 object = html_get_objects(c, &count);
525
526 for (; object != NULL; object = object->next) {
527 if ((object->content != NULL) &&
528 (object->box != NULL)) {
529 res = save_complete_save_html_object(ctx, object->content);
530 if (res != NSERROR_OK) {
531 return res;
532 }
533 }
534 }
535
536 return NSERROR_OK;
537}
538
539static bool
541 bool (*callback)(dom_node *node,
543 void *ctx),
544 void *ctx)
545{
546 dom_node *node;
547
548 node = dom_node_ref(root); /* tree root */
549
550 while (node != NULL) {
551 dom_node *next = NULL;
552 dom_exception exc;
553
554 exc = dom_node_get_first_child(node, &next);
555 if (exc != DOM_NO_ERR) {
556 dom_node_unref(node);
557 break;
558 }
559
560 if (next != NULL) { /* 1. children */
561 dom_node_unref(node);
562 node = next;
563 } else {
564 exc = dom_node_get_next_sibling(node, &next);
565 if (exc != DOM_NO_ERR) {
566 dom_node_unref(node);
567 break;
568 }
569
570 if (next != NULL) { /* 2. siblings */
571 if (callback(node, EVENT_LEAVE, ctx) == false) {
572 return false;
573 }
574 dom_node_unref(node);
575 node = next;
576 } else { /* 3. ancestor siblings */
577 while (node != NULL) {
578 exc = dom_node_get_next_sibling(node,
579 &next);
580 if (exc != DOM_NO_ERR) {
581 dom_node_unref(node);
582 node = NULL;
583 break;
584 }
585
586 if (next != NULL) {
587 dom_node_unref(next);
588 break;
589 }
590
591 exc = dom_node_get_parent_node(node,
592 &next);
593 if (exc != DOM_NO_ERR) {
594 dom_node_unref(node);
595 node = NULL;
596 break;
597 }
598
599 if (callback(node, EVENT_LEAVE,
600 ctx) == false) {
601 return false;
602 }
603 dom_node_unref(node);
604 node = next;
605 }
606
607 if (node == NULL)
608 break;
609
610 exc = dom_node_get_next_sibling(node, &next);
611 if (exc != DOM_NO_ERR) {
612 dom_node_unref(node);
613 break;
614 }
615
616 if (callback(node, EVENT_LEAVE, ctx) == false) {
617 return false;
618 }
619 dom_node_unref(node);
620 node = next;
621 }
622 }
623
624 assert(node != NULL);
625
626 if (callback(node, EVENT_ENTER, ctx) == false) {
627 return false; /* callback caused early termination */
628 }
629
630 }
631
632 return true;
633}
634
636 const char *value, size_t value_len)
637{
638 nsurl *url;
640 char *escaped;
641 nserror error;
642
643 error = nsurl_join(ctx->base, value, &url);
644 if (error == NSERROR_NOMEM)
645 return false;
646
647 if (url != NULL) {
649 if (content != NULL) {
650 /* found a match */
651 nsurl_unref(url);
652
653 fprintf(ctx->fp, "\"%p\"", content);
654 } else {
655 /* no match found */
656 error = utf8_to_html(nsurl_access(url), "UTF-8",
657 nsurl_length(url), &escaped);
658 nsurl_unref(url);
659
660 if (error != NSERROR_OK)
661 return false;
662
663 fprintf(ctx->fp, "\"%s\"", escaped);
664
665 free(escaped);
666 }
667 } else {
668 error = utf8_to_html(value, "UTF-8", value_len, &escaped);
669 if (error != NSERROR_OK)
670 return false;
671
672 fprintf(ctx->fp, "\"%s\"", escaped);
673
674 free(escaped);
675 }
676
677 return true;
678}
679
681 const char *value, size_t value_len)
682{
683 char *escaped;
684 nserror ret;
685
686 ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
687 if (ret != NSERROR_OK)
688 return false;
689
690 fprintf(ctx->fp, "\"%s\"", escaped);
691
692 free(escaped);
693
694 return true;
695}
696
698 dom_string *node_name, dom_string *attr_name,
699 dom_string *attr_value)
700{
701 const char *node_data = dom_string_data(node_name);
702 size_t node_len = dom_string_byte_length(node_name);
703 const char *name_data = dom_string_data(attr_name);
704 size_t name_len = dom_string_byte_length(attr_name);
705 const char *value_data = dom_string_data(attr_value);
706 size_t value_len = dom_string_byte_length(attr_value);
707
708 /**
709 * We only need to consider the following cases:
710 *
711 * Attribute: Elements:
712 *
713 * 1) data object
714 * 2) href a, area, link
715 * 3) src script, input, frame, iframe, img
716 * 4) background any (except those above)
717 */
718 /* 1 */
719 if (name_len == SLEN("data") &&
720 strncasecmp(name_data, "data", name_len) == 0) {
721 if (node_len == SLEN("object") &&
722 strncasecmp(node_data,
723 "object", node_len) == 0) {
725 value_data, value_len);
726 } else {
727 return save_complete_write_value(ctx,
728 value_data, value_len);
729 }
730 }
731 /* 2 */
732 else if (name_len == SLEN("href") &&
733 strncasecmp(name_data, "href", name_len) == 0) {
734 if ((node_len == SLEN("a") &&
735 strncasecmp(node_data, "a", node_len) == 0) ||
736 (node_len == SLEN("area") &&
737 strncasecmp(node_data, "area",
738 node_len) == 0) ||
739 (node_len == SLEN("link") &&
740 strncasecmp(node_data, "link",
741 node_len) == 0)) {
743 value_data, value_len);
744 } else {
745 return save_complete_write_value(ctx,
746 value_data, value_len);
747 }
748 }
749 /* 3 */
750 else if (name_len == SLEN("src") &&
751 strncasecmp(name_data, "src", name_len) == 0) {
752 if ((node_len == SLEN("frame") &&
753 strncasecmp(node_data, "frame",
754 node_len) == 0) ||
755 (node_len == SLEN("iframe") &&
756 strncasecmp(node_data, "iframe",
757 node_len) == 0) ||
758 (node_len == SLEN("input") &&
759 strncasecmp(node_data, "input",
760 node_len) == 0) ||
761 (node_len == SLEN("img") &&
762 strncasecmp(node_data, "img",
763 node_len) == 0) ||
764 (node_len == SLEN("script") &&
765 strncasecmp(node_data, "script",
766 node_len) == 0)) {
768 value_data, value_len);
769 } else {
770 return save_complete_write_value(ctx,
771 value_data, value_len);
772 }
773 }
774 /* 4 */
775 else if (name_len == SLEN("background") &&
776 strncasecmp(name_data, "background", name_len) == 0) {
778 value_data, value_len);
779 } else {
780 return save_complete_write_value(ctx,
781 value_data, value_len);
782 }
783}
784
785static bool
787 dom_string *node_name,
788 dom_attr *attr)
789{
790 dom_string *name;
791 const char *name_data;
792 size_t name_len;
793 dom_string *value;
794 dom_exception error;
795
796 error = dom_attr_get_name(attr, &name);
797 if (error != DOM_NO_ERR)
798 return false;
799
800 if (name == NULL)
801 return true;
802
803 error = dom_attr_get_value(attr, &value);
804 if (error != DOM_NO_ERR) {
805 dom_string_unref(name);
806 return false;
807 }
808
809 name_data = dom_string_data(name);
810 name_len = dom_string_byte_length(name);
811
812 fputc(' ', ctx->fp);
813 fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
814
815 if (value != NULL) {
816 fputc('=', ctx->fp);
818 name, value) == false) {
819 dom_string_unref(value);
820 dom_string_unref(name);
821 return false;
822 }
823 dom_string_unref(value);
824 }
825
826 dom_string_unref(name);
827
828 return true;
829}
830
831static bool
833 dom_string *node_name,
834 dom_namednodemap *attrs)
835{
836 uint32_t length, i;
837 dom_exception error;
838
839 error = dom_namednodemap_get_length(attrs, &length);
840 if (error != DOM_NO_ERR)
841 return false;
842
843 for (i = 0; i < length; i++) {
844 dom_attr *attr;
845
846 error = dom_namednodemap_item(attrs, i, (void *) &attr);
847 if (error != DOM_NO_ERR)
848 return false;
849
850 if (attr == NULL)
851 continue;
852
853 if (save_complete_handle_attr(ctx, node_name, attr) == false) {
854 dom_node_unref(attr);
855 return false;
856 }
857
858 dom_node_unref(attr);
859 }
860
861 return true;
862}
863
864static bool
866 dom_node *node,
868{
869 dom_string *name;
870 dom_namednodemap *attrs;
871 const char *name_data;
872 size_t name_len;
873 bool process = true;
874 dom_exception error;
875
876 ctx->iter_state = STATE_NORMAL;
877
878 error = dom_node_get_node_name(node, &name);
879 if (error != DOM_NO_ERR)
880 return false;
881
882 if (name == NULL)
883 return true;
884
885 name_data = dom_string_data(name);
886 name_len = dom_string_byte_length(name);
887
888 if ((name_len == SLEN("base")) &&
889 (strncasecmp(name_data, "base", name_len) == 0)) {
890 /* Elide BASE elements from the output */
891 process = false;
892 } else if ((name_len == SLEN("meta")) &&
893 (strncasecmp(name_data, "meta", name_len) == 0)) {
894 /* Don't emit close tags for META elements */
895 if (event_type == EVENT_LEAVE) {
896 process = false;
897 } else {
898 /* Elide meta charsets */
899 dom_string *value;
900 error = dom_element_get_attribute(node,
901 corestring_dom_http_equiv,
902 &value);
903 if (error != DOM_NO_ERR) {
904 dom_string_unref(name);
905 return false;
906 }
907
908 if (value != NULL) {
909 if (dom_string_length(value) ==
910 SLEN("Content-Type") &&
911 strncasecmp(dom_string_data(value),
912 "Content-Type",
913 SLEN("Content-Type")) == 0)
914 process = false;
915
916 dom_string_unref(value);
917 } else {
918 bool yes;
919
920 error = dom_element_has_attribute(node,
921 corestring_dom_charset, &yes);
922 if (error != DOM_NO_ERR) {
923 dom_string_unref(name);
924 return false;
925 }
926
927 if (yes)
928 process = false;
929 }
930 }
931 } else if (event_type == EVENT_LEAVE &&
932 ((name_len == SLEN("link") &&
933 strncasecmp(name_data, "link", name_len) == 0))) {
934 /* Don't emit close tags for void elements */
935 process = false;
936 }
937
938 if (process == false) {
939 dom_string_unref(name);
940 return true;
941 }
942
943 fputc('<', ctx->fp);
944 if (event_type == EVENT_LEAVE) {
945 fputc('/', ctx->fp);
946 }
947 fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
948
949 if (event_type == EVENT_ENTER) {
950 error = dom_node_get_attributes(node, &attrs);
951 if (error != DOM_NO_ERR) {
952 dom_string_unref(name);
953 return false;
954 }
955
956 if (save_complete_handle_attrs(ctx, name, attrs) == false) {
957 dom_namednodemap_unref(attrs);
958 dom_string_unref(name);
959 return false;
960 }
961
962 dom_namednodemap_unref(attrs);
963 }
964
965 fputc('>', ctx->fp);
966
967 /* Rewrite contents of style elements */
968 if (event_type == EVENT_ENTER && name_len == SLEN("style") &&
969 strncasecmp(name_data, "style", name_len) == 0) {
970 dom_string *content;
971
972 error = dom_node_get_text_content(node, &content);
973 if (error != DOM_NO_ERR) {
974 dom_string_unref(name);
975 return false;
976 }
977
978 if (content != NULL) {
979 uint8_t *rewritten;
980 size_t len;
981
982 /* Rewrite @import rules */
984 ctx,
985 (const uint8_t *)dom_string_data(content),
986 dom_string_byte_length(content),
987 ctx->base,
988 &len);
989 if (rewritten == NULL) {
990 dom_string_unref(content);
991 dom_string_unref(name);
992 return false;
993 }
994
995 dom_string_unref(content);
996
997 fwrite(rewritten, sizeof(*rewritten), len, ctx->fp);
998
999 free(rewritten);
1000 }
1001
1002 ctx->iter_state = STATE_IN_STYLE;
1003 } else if (event_type == EVENT_ENTER && name_len == SLEN("head") &&
1004 strncasecmp(name_data, "head", name_len) == 0) {
1005 /* If this is a HEAD element, insert a meta charset */
1006 fputs("<META http-equiv=\"Content-Type\" "
1007 "content=\"text/html; charset=utf-8\">",
1008 ctx->fp);
1009 }
1010
1011 dom_string_unref(name);
1012
1013 return true;
1014}
1015
1016static bool
1019 void *ctxin)
1020{
1021 save_complete_ctx *ctx = ctxin;
1022 dom_node_type type;
1023 dom_exception error;
1024 nserror ret;
1025
1026 error = dom_node_get_node_type(node, &type);
1027 if (error != DOM_NO_ERR)
1028 return false;
1029
1030 if (type == DOM_ELEMENT_NODE) {
1031 return save_complete_handle_element(ctx, node, event_type);
1032 } else if (type == DOM_TEXT_NODE || type == DOM_COMMENT_NODE) {
1033 if (event_type != EVENT_ENTER)
1034 return true;
1035
1036 if (ctx->iter_state != STATE_IN_STYLE) {
1037 /* Emit text content */
1038 dom_string *text;
1039 const char *text_data;
1040 size_t text_len;
1041
1042 error = dom_characterdata_get_data(node, &text);
1043 if (error != DOM_NO_ERR) {
1044 return false;
1045 }
1046
1047 if (type == DOM_COMMENT_NODE)
1048 fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp);
1049
1050 if (text != NULL) {
1051 char *escaped;
1052
1053 text_data = dom_string_data(text);
1054 text_len = dom_string_byte_length(text);
1055
1056 ret = utf8_to_html(text_data, "UTF-8",
1057 text_len, &escaped);
1058 if (ret != NSERROR_OK)
1059 return false;
1060
1061 fwrite(escaped, sizeof(*escaped),
1062 strlen(escaped), ctx->fp);
1063
1064 free(escaped);
1065
1066 dom_string_unref(text);
1067 }
1068
1069 if (type == DOM_COMMENT_NODE) {
1070 fwrite("-->", 1, sizeof("-->") - 1, ctx->fp);
1071 }
1072 }
1073
1074 } else if (type == DOM_DOCUMENT_TYPE_NODE) {
1075 dom_string *name;
1076 const char *name_data;
1077 size_t name_len;
1078
1079 if (event_type != EVENT_ENTER)
1080 return true;
1081
1082 error = dom_document_type_get_name(node, &name);
1083 if (error != DOM_NO_ERR)
1084 return false;
1085
1086 if (name == NULL)
1087 return true;
1088
1089 name_data = dom_string_data(name);
1090 name_len = dom_string_byte_length(name);
1091
1092 fputs("<!DOCTYPE ", ctx->fp);
1093 fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
1094
1095 dom_string_unref(name);
1096
1097 error = dom_document_type_get_public_id(node, &name);
1098 if (error != DOM_NO_ERR)
1099 return false;
1100
1101 if (name != NULL) {
1102 name_data = dom_string_data(name);
1103 name_len = dom_string_byte_length(name);
1104
1105 if (name_len > 0)
1106 fprintf(ctx->fp, " PUBLIC \"%.*s\"",
1107 (int) name_len, name_data);
1108
1109 dom_string_unref(name);
1110 }
1111
1112 error = dom_document_type_get_system_id(node, &name);
1113 if (error != DOM_NO_ERR)
1114 return false;
1115
1116 if (name != NULL) {
1117 name_data = dom_string_data(name);
1118 name_len = dom_string_byte_length(name);
1119
1120 if (name_len > 0)
1121 fprintf(ctx->fp, " \"%.*s\"",
1122 (int) name_len, name_data);
1123
1124 dom_string_unref(name);
1125 }
1126
1127 fputc('>', ctx->fp);
1128 } else if (type == DOM_DOCUMENT_NODE) {
1129 /* Do nothing */
1130 } else {
1131 NSLOG(netsurf, INFO, "Unhandled node type: %d", type);
1132 }
1133
1134 return true;
1135}
1136
1137static nserror
1139 hlcache_handle *c,
1140 bool index)
1141{
1142 nserror ret;
1143 FILE *fp;
1144 char *fname = NULL;
1145 dom_document *doc;
1146 lwc_string *mime_type;
1147 char filename[32];
1148
1149 if (index) {
1150 snprintf(filename, sizeof filename, "index");
1151 } else {
1152 snprintf(filename, sizeof filename, "%p", c);
1153 }
1154
1155 ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, filename);
1156 if (ret != NSERROR_OK) {
1157 return ret;
1158 }
1159
1160 fp = fopen(fname, "wb");
1161 if (fp == NULL) {
1162 free(fname);
1163 NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
1164 return NSERROR_SAVE_FAILED;
1165 }
1166
1167 ctx->base = html_get_base_url(c);
1168 ctx->fp = fp;
1169 ctx->iter_state = STATE_NORMAL;
1170
1171 doc = html_get_document(c);
1172
1173 if (save_complete_libdom_treewalk((dom_node *)doc,
1175 ctx) == false) {
1176 free(fname);
1177 fclose(fp);
1178 return NSERROR_NOMEM;
1179 }
1180
1181 fclose(fp);
1182
1183 mime_type = content_get_mime_type(c);
1184 if (mime_type != NULL) {
1185 if (ctx->set_type != NULL) {
1186 ctx->set_type(fname, mime_type);
1187 }
1188
1189 lwc_string_unref(mime_type);
1190 }
1191 free(fname);
1192
1193 return NSERROR_OK;
1194}
1195
1196/**
1197 * Save an HTML page with all dependencies, recursing through imported pages.
1198 *
1199 * \param ctx Save complete context
1200 * \param c Content to save
1201 * \param index true to save as "index"
1202 * \return NSERROR_OK on success, appropriate error code on failure
1203 */
1204static nserror
1206 hlcache_handle *c,
1207 bool index)
1208{
1209 nserror res;
1210
1211 if (content_get_type(c) != CONTENT_HTML) {
1212 return NSERROR_INVALID;
1213 }
1214
1215 if (save_complete_ctx_has_content(ctx, c)) {
1216 return NSERROR_OK;
1217 }
1218
1220 if (res != NSERROR_OK) {
1221 return res;
1222 }
1223
1224 res = save_complete_save_html_objects(ctx, c);
1225 if (res != NSERROR_OK) {
1226 return res;
1227 }
1228
1229 return save_complete_save_html_document(ctx, c, index);
1230}
1231
1232
1233/**
1234 * Create the inventory file listing original URLs.
1235 */
1236
1238{
1239 nserror ret;
1240 FILE *fp;
1241 char *fname = NULL;
1242 save_complete_entry *entry;
1243
1244 ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, "Inventory");
1245 if (ret != NSERROR_OK) {
1246 return ret;
1247 }
1248
1249 fp = fopen(fname, "w");
1250 free(fname);
1251 if (fp == NULL) {
1252 NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
1253 return NSERROR_SAVE_FAILED;
1254 }
1255
1256 for (entry = ctx->list; entry != NULL; entry = entry->next) {
1257 fprintf(fp, "%p %s\n",
1258 entry->content,
1260 entry->content)));
1261 }
1262
1263 fclose(fp);
1264
1265 return NSERROR_OK;
1266}
1267
1268/**
1269 * Compile a regular expression, handling errors.
1270 *
1271 * Parameters as for regcomp(), see man regex.
1272 */
1273static nserror regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
1274{
1275 int r;
1276 r = regcomp(preg, regex, cflags);
1277 if (r) {
1278 char errbuf[200];
1279 regerror(r, preg, errbuf, sizeof errbuf);
1280 NSLOG(netsurf, INFO, "Failed to compile regexp '%s': %s\n",
1281 regex, errbuf);
1282 return NSERROR_INIT_FAILED;
1283 }
1284 return NSERROR_OK;
1285}
1286
1287
1288/* Documented in save_complete.h */
1290{
1291 /* Match an @import rule - see CSS 2.1 G.1. */
1293 "@import" /* IMPORT_SYM */
1294 "[ \t\r\n\f]*" /* S* */
1295 /* 1 */
1296 "(" /* [ */
1297 /* 2 3 */
1298 "\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */
1299 "|"
1300 /* 4 5 */
1301 "'(([^']|[\\]')*)'"
1302 "|" /* | */
1303 "url\\([ \t\r\n\f]*" /* URI (approximated) */
1304 /* 6 7 */
1305 "\"(([^\"]|[\\]\")*)\""
1306 "[ \t\r\n\f]*\\)"
1307 "|"
1308 "url\\([ \t\r\n\f]*"
1309 /* 8 9 */
1310 "'(([^']|[\\]')*)'"
1311 "[ \t\r\n\f]*\\)"
1312 "|"
1313 "url\\([ \t\r\n\f]*"
1314 /* 10 */
1315 "([^) \t\r\n\f]*)"
1316 "[ \t\r\n\f]*\\)"
1317 ")", /* ] */
1319}
1320
1321/* Documented in save_complete.h */
1323{
1325 return NSERROR_OK;
1326}
1327
1328/* Documented in save_complete.h */
1329nserror
1331 const char *path,
1333{
1336
1337 save_complete_ctx_initialise(&ctx, path, set_type);
1338
1339 result = save_complete_save_html(&ctx, c, true);
1340
1341 if (result == NSERROR_OK) {
1343 }
1344
1346
1347 return result;
1348}
STATIC char result[100]
Definition: arexx.c:77
Helpers for ASCII string handling.
static char ascii_to_lower(char c)
Convert an upper case character to lower case.
Definition: ascii.h:212
Box interface.
static uint32_t count(const http_directive *list, lwc_string *key)
char * strndup(const char *s, size_t n)
Duplicate up to n characters of a string.
Definition: utils.c:332
@ CONTENT_NONE
no type for content
Definition: content_type.h:55
@ CONTENT_HTML
content is HTML
Definition: content_type.h:58
Useful interned string pointers (interface).
struct nscss_import * nscss_get_imports(hlcache_handle *h, uint32_t *n)
Retrieve imported stylesheets.
Definition: css.c:447
nserror
Enumeration of error codes.
Definition: errors.h:29
@ NSERROR_SAVE_FAILED
Failed to save data.
Definition: errors.h:36
@ NSERROR_INIT_FAILED
Initialisation failed.
Definition: errors.h:38
@ NSERROR_INVALID
Invalid data.
Definition: errors.h:49
@ NSERROR_NOMEM
Memory exhaustion.
Definition: errors.h:32
@ NSERROR_OK
No error.
Definition: errors.h:30
static struct directory * root
Definition: filename.c:55
const char * type
Definition: filetype.cpp:44
Interface to core interface table.
struct content * hlcache_handle_get_content(const hlcache_handle *handle)
Retrieve a content object from a cache handle.
Definition: hlcache.c:776
High-level resource cache interface.
struct html_stylesheet * html_get_stylesheets(hlcache_handle *h, unsigned int *n)
Retrieve stylesheets used by HTML document.
Definition: css.c:498
nsurl * html_get_base_url(hlcache_handle *h)
Retrieve an HTML content's base URL.
Definition: html.c:2039
dom_document * html_get_document(hlcache_handle *h)
Retrieve HTML document tree.
Definition: html.c:1953
Interface to text/html content handler.
struct content_html_object * html_get_objects(struct hlcache_handle *h, unsigned int *n)
Retrieve objects used by HTML document.
Definition: object.c:60
#define STYLESHEET_START
Definition: html.h:152
Interface to HTML content handler to save documents.
Public content interface.
struct nsurl * hlcache_handle_get_url(const struct hlcache_handle *handle)
Retrieve the URL associated with a high level cache handle.
const uint8_t * content_get_source_data(struct hlcache_handle *h, size_t *size)
Retrieve source of content.
Definition: content.c:1209
lwc_string * content_get_mime_type(struct hlcache_handle *h)
Retrieve mime-type of content.
Definition: content.c:1073
content_type content_get_type(struct hlcache_handle *h)
Retrieve computed type of content.
Definition: content.c:1061
Interface to platform-specific miscellaneous browser operation table.
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116
Localised message support (interface).
NetSurf URL handling (interface).
bool nsurl_compare(const nsurl *url1, const nsurl *url2, nsurl_component parts)
Compare two URLs.
void nsurl_unref(nsurl *url)
Drop a reference to a NetSurf URL object.
const char * nsurl_access(const nsurl *url)
Access a NetSurf URL object as a string.
size_t nsurl_length(const nsurl *url)
Find the length of a NetSurf URL object's URL, as returned by nsurl_access.
@ NSURL_COMPLETE
Definition: nsurl.h:54
nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined)
Join a base url to a relative link part, creating a new NetSurf URL object.
struct nsurl nsurl
NetSurf URL object.
Definition: nsurl.h:31
@ base
Definition: punycode.c:19
#define REG_ICASE
Definition: regex.h:32
#define REG_EXTENDED
Definition: regex.h:31
size_t regerror(int errorcode, const regex_t *restrict preg, char *restrict errbuf, size_t errbuf_size)
Definition: utils.c:548
int regexec(const regex_t *restrict preg, const char *restrict string, size_t nmatch, regmatch_t pmatch[restrict], int eflags)
Definition: utils.c:560
void regfree(regex_t *preg)
Definition: utils.c:569
int regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
Definition: utils.c:542
static void save_complete_ctx_finalise(save_complete_ctx *ctx)
Definition: save_complete.c:91
static bool save_complete_ctx_has_content(save_complete_ctx *ctx, struct hlcache_handle *content)
void save_complete_init(void)
Initialise save complete module.
static nserror save_complete_save_html_object(save_complete_ctx *ctx, hlcache_handle *obj)
nserror save_complete(hlcache_handle *c, const char *path, save_complete_set_type_cb set_type)
Save an HTML page with all dependencies.
static bool save_complete_handle_element(save_complete_ctx *ctx, dom_node *node, save_complete_event_type event_type)
static int snregexec(regex_t *preg, const char *string, size_t stringlen, size_t nmatch, regmatch_t pmatch[], int eflags)
Perform a POSIX regexec on a string without a null terminator.
static nserror save_complete_save_buffer(save_complete_ctx *ctx, const char *leafname, const uint8_t *data, size_t data_len, lwc_string *mime_type)
static nserror save_complete_save_html_document(save_complete_ctx *ctx, hlcache_handle *c, bool index)
static bool save_complete_node_handler(dom_node *node, save_complete_event_type event_type, void *ctxin)
static bool save_complete_handle_attr_value(save_complete_ctx *ctx, dom_string *node_name, dom_string *attr_name, dom_string *attr_value)
static nserror save_complete_ctx_add_content(save_complete_ctx *ctx, struct hlcache_handle *content)
struct save_complete_entry save_complete_entry
An entry in save_complete_list.
static nserror save_complete_save_html_stylesheets(save_complete_ctx *ctx, hlcache_handle *c)
static bool save_complete_rewrite_url_value(save_complete_ctx *ctx, const char *value, size_t value_len)
static bool save_complete_handle_attrs(save_complete_ctx *ctx, dom_string *node_name, dom_namednodemap *attrs)
static nserror regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
Compile a regular expression, handling errors.
static regex_t save_complete_import_re
Definition: save_complete.c:54
static nserror save_complete_save_html_stylesheet(save_complete_ctx *ctx, struct html_stylesheet *sheet)
static bool save_complete_libdom_treewalk(dom_node *root, bool(*callback)(dom_node *node, save_complete_event_type event_type, void *ctx), void *ctx)
static bool save_complete_write_value(save_complete_ctx *ctx, const char *value, size_t value_len)
save_complete_event_type
Definition: save_complete.c:72
@ EVENT_ENTER
Definition: save_complete.c:73
@ EVENT_LEAVE
Definition: save_complete.c:74
static nserror save_complete_save_html_objects(save_complete_ctx *ctx, hlcache_handle *c)
static bool save_complete_handle_attr(save_complete_ctx *ctx, dom_string *node_name, dom_attr *attr)
struct save_complete_ctx save_complete_ctx
static nserror save_complete_save_imported_sheets(save_complete_ctx *ctx, struct nscss_import *imports, uint32_t import_count)
static nserror save_complete_save_stylesheet(save_complete_ctx *ctx, hlcache_handle *css)
static uint8_t * save_complete_rewrite_stylesheet_urls(save_complete_ctx *ctx, const uint8_t *source, size_t size, const nsurl *base, size_t *osize)
Rewrite stylesheet @import rules for save complete.
static nserror save_complete_save_html(save_complete_ctx *ctx, struct hlcache_handle *c, bool index)
Save an HTML page with all dependencies, recursing through imported pages.
nserror save_complete_finalise(void)
Finalise save complete module.
static struct hlcache_handle * save_complete_ctx_find_content(save_complete_ctx *ctx, const nsurl *url)
Find handle to content for URL.
static nserror save_complete_inventory(save_complete_ctx *ctx)
Create the inventory file listing original URLs.
static void save_complete_ctx_initialise(save_complete_ctx *ctx, const char *path, save_complete_set_type_cb set_type)
Definition: save_complete.c:83
Save HTML document with dependencies (interface).
void(* save_complete_set_type_cb)(const char *path, lwc_string *mime_type)
Callback to set type of a file.
Definition: save_complete.h:39
static css_error node_name(void *pw, void *node, css_qname *qname)
Callback to retrieve a node's name.
Definition: select.c:373
Interface to utility string handling.
An object (img, object, etc.).
Definition: html.h:93
struct content_html_object * next
Next in chain.
Definition: html.h:95
struct box * box
Node in box tree containing it.
Definition: html.h:98
struct hlcache_handle * content
Content, or 0.
Definition: html.h:97
Content which corresponds to a single URL.
hlcache_entry * next
Next sibling.
Definition: hlcache.c:77
struct content * content
Pointer to associated content.
Definition: hlcache.c:75
High-level cache handle.
Definition: hlcache.c:66
hlcache_entry * entry
Pointer to cache entry.
Definition: hlcache.c:67
Container for stylesheets used by an HTML document.
Definition: html.h:58
struct hlcache_handle * sheet
Definition: html.h:60
Imported stylesheet record.
Definition: css.h:33
struct hlcache_handle * c
Content containing sheet.
Definition: css.h:34
Definition: regex.h:38
regoff_t rm_eo
Definition: regex.h:47
regoff_t rm_so
Definition: regex.h:44
save_complete_entry * list
Definition: save_complete.c:64
const char * path
Definition: save_complete.c:63
enum save_complete_ctx::@76 iter_state
save_complete_set_type_cb set_type
Definition: save_complete.c:65
An entry in save_complete_list.
Definition: save_complete.c:57
struct hlcache_handle * content
Definition: save_complete.c:58
struct save_complete_entry * next
Next entry in list.
Definition: save_complete.c:59
nserror netsurf_mkpath(char **str, size_t *size, size_t nelm,...)
Generate a path from one or more component elements.
Definition: file.c:288
Default operations table for files.
nserror utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences where necessary.
Definition: utf8.c:369
UTF-8 manipulation functions (interface).
Interface to a number of general purpose functionality.
#define SLEN(x)
Calculate length of constant C string.
Definition: utils.h:88
event_type
Definition: wimp_event.c:48
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821
static nserror text(const struct redraw_context *ctx, const struct plot_font_style *fstyle, int x, int y, const char *text, size_t length)
Text plotting.
Definition: plot.c:978