NetSurf
save_text.c
Go to the documentation of this file.
1/*
2 * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
3 * Copyright 2008 Michael Drake <tlsa@netsurf-browser.org>
4 *
5 * This file is part of NetSurf, http://www.netsurf-browser.org/
6 *
7 * NetSurf is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 * NetSurf is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20/** \file
21 * Text export of HTML (implementation).
22 */
23
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <dom/dom.h>

#include "utils/config.h"
#include "utils/log.h"
#include "utils/utf8.h"
#include "utils/utils.h"
#include "netsurf/content.h"
#include "html/box.h"
#include "html/html_save.h"

#include "netsurf/utf8.h"
#include "desktop/gui_internal.h"
#include "desktop/save_text.h"
41
42static void extract_text(struct box *box, bool *first,
43 save_text_whitespace *before, struct save_text_state *save);
44static bool save_text_add_to_buffer(const char *text, size_t length,
45 struct box *box, const char *whitespace_text,
46 size_t whitespace_length, struct save_text_state *save);
47
48
49/**
50 * Extract the text from an HTML content and save it as a text file. Text is
51 * converted to the local encoding.
52 *
53 * \param c An HTML content.
54 * \param path Path to save text file too.
55 */
56
57void save_as_text(struct hlcache_handle *c, char *path)
58{
59 FILE *out;
60 struct save_text_state save = { NULL, 0, 0 };
62 bool first = true;
63 nserror ret;
64 char *result;
65
66 if (!c || content_get_type(c) != CONTENT_HTML) {
67 return;
68 }
69
70 extract_text(html_get_box_tree(c), &first, &before, &save);
71 if (!save.block)
72 return;
73
74 ret = guit->utf8->utf8_to_local(save.block, save.length, &result);
75 free(save.block);
76
77 if (ret != NSERROR_OK) {
78 NSLOG(netsurf, INFO,
79 "failed to convert to local encoding, return %d", ret);
80 return;
81 }
82
83 out = fopen(path, "w");
84 if (out) {
85 int res = fputs(result, out);
86
87 if (res < 0) {
88 NSLOG(netsurf, INFO, "Warning: write failed");
89 }
90
91 res = fputs("\n", out);
92 if (res < 0) {
93 NSLOG(netsurf, INFO,
94 "Warning: failed writing trailing newline");
95 }
96
97 fclose(out);
98 }
99
100 free(result);
101}
102
103
104/**
105 * Decide what whitespace to place before the next bit of content-related text
106 * that is saved. Any existing whitespace is overridden if the whitespace for
107 * this box is more "significant".
108 *
109 * \param box Pointer to box.
110 * \param first Whether this is before the first bit of content-related
111 * text to be saved.
112 * \param before Type of whitespace currently intended to be placed
113 * before the next bit of content-related text to be saved.
114 * Updated if this box is worthy of more significant
115 * whitespace.
116 * \param whitespace_text Whitespace to place before next bit of
117 * content-related text to be saved.
118 * Updated if this box is worthy of more significant
119 * whitespace.
120 * \param whitespace_length Length of whitespace_text.
121 * Updated if this box is worthy of more significant
122 * whitespace.
123 */
124
125void save_text_solve_whitespace(struct box *box, bool *first,
126 save_text_whitespace *before, const char **whitespace_text,
127 size_t *whitespace_length)
128{
129 /* work out what whitespace should be placed before the next bit of
130 * text */
131 if (*before < WHITESPACE_TWO_NEW_LINES &&
132 /* significant box type */
133 (box->type == BOX_BLOCK ||
134 box->type == BOX_TABLE ||
135 box->type == BOX_FLOAT_LEFT ||
136 box->type == BOX_FLOAT_RIGHT) &&
137 /* and not a list element */
138 !box->list_marker &&
139 /* and not a marker... */
140 (!(box->parent && box->parent->list_marker == box) ||
141 /* ...unless marker follows WHITESPACE_TAB */
142 ((box->parent && box->parent->list_marker == box) &&
143 *before == WHITESPACE_TAB))) {
144 *before = WHITESPACE_TWO_NEW_LINES;
145 } else if (*before <= WHITESPACE_ONE_NEW_LINE &&
146 (box->type == BOX_TABLE_ROW ||
147 box->type == BOX_BR ||
148 (box->type != BOX_INLINE &&
149 (box->parent && box->parent->list_marker == box)) ||
150 (box->parent && box->parent->style &&
151 (css_computed_white_space(box->parent->style) ==
152 CSS_WHITE_SPACE_PRE ||
153 css_computed_white_space(box->parent->style) ==
154 CSS_WHITE_SPACE_PRE_WRAP) &&
156 if (*before == WHITESPACE_ONE_NEW_LINE)
157 *before = WHITESPACE_TWO_NEW_LINES;
158 else
159 *before = WHITESPACE_ONE_NEW_LINE;
160 }
161 else if (*before < WHITESPACE_TAB &&
162 (box->type == BOX_TABLE_CELL ||
163 box->list_marker)) {
164 *before = WHITESPACE_TAB;
165 }
166
167 if (*first) {
168 /* before the first bit of text to be saved; there is
169 * no preceding whitespace */
170 *whitespace_text = "";
171 *whitespace_length = 0;
172 } else {
173 /* set the whitespace that has been decided on */
174 switch (*before) {
176 *whitespace_text = "\n\n";
177 *whitespace_length = 2;
178 break;
180 *whitespace_text = "\n";
181 *whitespace_length = 1;
182 break;
183 case WHITESPACE_TAB:
184 *whitespace_text = "\t";
185 *whitespace_length = 1;
186 break;
187 case WHITESPACE_NONE:
188 *whitespace_text = "";
189 *whitespace_length = 0;
190 break;
191 default:
192 *whitespace_text = "";
193 *whitespace_length = 0;
194 break;
195 }
196 }
197}
198
199
200/**
201 * Traverse though the box tree and add all text to a save buffer.
202 *
203 * \param box Pointer to box.
204 * \param first Whether this is before the first bit of content-related
205 * text to be saved.
206 * \param before Type of whitespace currently intended to be placed
207 * before the next bit of content-related text to be saved.
208 * Updated if this box is worthy of more significant
209 * whitespace.
210 * \param save our save_text_state workspace pointer
211 * \return true iff the file writing succeeded and traversal should continue.
212 */
213
214void extract_text(struct box *box, bool *first, save_text_whitespace *before,
215 struct save_text_state *save)
216{
217 struct box *child;
218 const char *whitespace_text = "";
219 size_t whitespace_length = 0;
220
221 assert(box);
222
223 /* If box has a list marker */
224 if (box->list_marker) {
225 /* do the marker box before continuing with the rest of the
226 * list element */
227 extract_text(box->list_marker, first, before, save);
228 }
229
230 /* read before calling the handler in case it modifies the tree */
231 child = box->children;
232
233 save_text_solve_whitespace(box, first, before, &whitespace_text,
234 &whitespace_length);
235
236 if (box->type != BOX_BR && !((box->type == BOX_FLOAT_LEFT ||
237 box->type == BOX_FLOAT_RIGHT) && !box->text) &&
238 box->length > 0 && box->text) {
239 /* Box meets criteria for export; add text to buffer */
241 whitespace_text, whitespace_length, save);
242 *first = false;
243 *before = WHITESPACE_NONE;
244 }
245
246 /* Work though the children of this box, extracting any text */
247 while (child) {
248 extract_text(child, first, before, save);
249 child = child->next;
250 }
251
252 return;
253}
254
255
256/**
257 * Add text to save text buffer. Any preceding whitespace or following space is
258 * also added to the buffer.
259 *
260 * \param text Pointer to text being added.
261 * \param length Length of text to be appended (bytes).
262 * \param box Pointer to text box.
263 * \param whitespace_text Whitespace to place before text for formatting
264 * may be NULL.
265 * \param whitespace_length Length of whitespace_text.
266 * \param save Our save_text_state workspace pointer.
267 * \return true iff the file writing succeeded and traversal should continue.
268 */
269
270bool save_text_add_to_buffer(const char *text, size_t length, struct box *box,
271 const char *whitespace_text, size_t whitespace_length,
272 struct save_text_state *save)
273{
274 size_t new_length;
275 int space = 0;
276
277 assert(save);
278
279 if (box->space > 0)
280 space = 1;
281
282 if (whitespace_text)
283 length += whitespace_length;
284
285 new_length = save->length + whitespace_length + length + space;
286 if (new_length >= save->alloc) {
287 size_t new_alloc = save->alloc + (save->alloc / 4);
288 char *new_block;
289
290 if (new_alloc < new_length) new_alloc = new_length;
291
292 new_block = realloc(save->block, new_alloc);
293 if (!new_block) return false;
294
295 save->block = new_block;
296 save->alloc = new_alloc;
297 }
298 if (whitespace_text) {
299 memcpy(save->block + save->length, whitespace_text,
300 whitespace_length);
301 }
302 memcpy(save->block + save->length + whitespace_length, text, length);
303 save->length += length;
304
305 if (space == 1)
306 save->block[save->length++] = ' ';
307
308 return true;
309}
STATIC char result[100]
Definition: arexx.c:77
Box interface.
@ BOX_BLOCK
Definition: box.h:56
@ BOX_FLOAT_LEFT
Definition: box.h:63
@ BOX_FLOAT_RIGHT
Definition: box.h:64
@ BOX_INLINE_CONTAINER
Definition: box.h:57
@ BOX_TABLE_CELL
Definition: box.h:61
@ BOX_TABLE
Definition: box.h:59
@ BOX_INLINE
Definition: box.h:58
@ BOX_TABLE_ROW
Definition: box.h:60
@ BOX_BR
Definition: box.h:66
@ CONTENT_HTML
content is HTML
Definition: content_type.h:58
nserror
Enumeration of error codes.
Definition: errors.h:29
@ NSERROR_OK
No error.
Definition: errors.h:30
struct netsurf_table * guit
The global interface table.
Definition: gui_factory.c:49
Interface to core interface table.
struct box * html_get_box_tree(hlcache_handle *h)
Retrieve box tree.
Definition: html.c:1971
Interface to HTML content handler to save documents.
Public content interface.
content_type content_get_type(struct hlcache_handle *h)
Retrieve computed type of content.
Definition: content.c:1061
Interface to platform-specific utf8 operations.
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116
void save_as_text(struct hlcache_handle *c, char *path)
Extract the text from an HTML content and save it as a text file.
Definition: save_text.c:57
void save_text_solve_whitespace(struct box *box, bool *first, save_text_whitespace *before, const char **whitespace_text, size_t *whitespace_length)
Decide what whitespace to place before the next bit of content-related text that is saved.
Definition: save_text.c:125
static bool save_text_add_to_buffer(const char *text, size_t length, struct box *box, const char *whitespace_text, size_t whitespace_length, struct save_text_state *save)
Add text to save text buffer.
Definition: save_text.c:270
static void extract_text(struct box *box, bool *first, save_text_whitespace *before, struct save_text_state *save)
Traverse though the box tree and add all text to a save buffer.
Definition: save_text.c:214
Text export of HTML (interface).
save_text_whitespace
Definition: save_text.h:37
@ WHITESPACE_TWO_NEW_LINES
Definition: save_text.h:41
@ WHITESPACE_ONE_NEW_LINE
Definition: save_text.h:40
@ WHITESPACE_NONE
Definition: save_text.h:38
@ WHITESPACE_TAB
Definition: save_text.h:39
Interface to utility string handling.
Node in box tree.
Definition: box.h:177
struct box * parent
Parent box, or NULL.
Definition: box.h:236
struct box * children
First child box, or NULL.
Definition: box.h:226
struct box * list_marker
List marker box if this is a list-item, or NULL.
Definition: box.h:417
struct box * next
Next sibling box, or NULL.
Definition: box.h:216
box_type type
Type of box.
Definition: box.h:181
css_computed_style * style
Style for this box.
Definition: box.h:205
size_t length
Length of text.
Definition: box.h:360
char * text
Text, or NULL if none.
Definition: box.h:355
int space
Width of space after current text (depends on font and size).
Definition: box.h:365
nserror(* utf8_to_local)(const char *string, size_t len, char **result)
Convert a UTF-8 encoded string into the system local encoding.
Definition: utf8.h:40
High-level cache handle.
Definition: hlcache.c:66
struct gui_utf8_table * utf8
UTF8 table.
Definition: gui_table.h:106
size_t length
Definition: save_text.h:33
char * block
Definition: save_text.h:32
size_t alloc
Definition: save_text.h:34
UTF-8 manipulation functions (interface).
Interface to a number of general purpose functionality.
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821
static nserror text(const struct redraw_context *ctx, const struct plot_font_style *fstyle, int x, int y, const char *text, size_t length)
Text plotting.
Definition: plot.c:978