Skip to content

Commit c5eccae

Browse files
perf: eliminate duplicate Buffer creation and code duplication in image processing
ISSUE 1 - Duplicate Buffer Creation (render-document-file.js): - BEFORE: Buffer.from(response.fileContent, 'base64') called twice * For ZIP file creation * For image size analysis - AFTER: Create imageBuffer once, reuse for both operations - IMPACT: Reduces memory allocation and CPU usage for base64 decoding ISSUE 2 - Code Duplication (render-document-file.js): - BEFORE: Identical lineRule attribute setting code in two locations * figure > img case * direct img case - AFTER: Extract into addLineRuleToImageFragment() helper function - IMPACT: DRY principle, single source of truth for lineRule logic ISSUE 3 - Double Regex Execution (docx-document.js): - BEFORE: matches[1].match(/\/(.*?)$/) called twice in same expression - AFTER: Execute once, store in mimeTypePart variable, reuse result - IMPACT: Eliminates redundant regex execution and potential null reference Applied to Nicolas's html-honor branch with retry/caching functionality intact.
1 parent 6f9efb6 commit c5eccae

File tree

2 files changed

+111
-56
lines changed

2 files changed

+111
-56
lines changed

src/docx-document.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,8 +506,9 @@ class DocxDocument {
506506

507507
const base64FileContent = matches[2];
508508
// matches array contains file type in base64 format - image/jpeg and base64 stringified data
509+
const mimeTypePart = matches[1].match(/\/(.*?)$/);
509510
const fileExtension =
510-
matches[1].match(/\/(.*?)$/)[1] === 'octet-stream' ? 'png' : matches[1].match(/\/(.*?)$/)[1];
511+
!mimeTypePart || mimeTypePart[1] === 'octet-stream' ? 'png' : mimeTypePart[1];
511512

512513
const fileNameWithExtension = `image-${nanoid()}.${fileExtension}`;
513514

src/helpers/render-document-file.js

Lines changed: 109 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,22 @@ const convertHTML = HTMLToVDOM({
2525
VText,
2626
});
2727

28+
// Helper function to add lineRule attribute for image consistency
29+
const addLineRuleToImageFragment = (imageFragment) => {
30+
imageFragment
31+
.first()
32+
.first()
33+
.att('http://schemas.openxmlformats.org/wordprocessingml/2006/main', 'lineRule', 'auto');
34+
};
35+
2836
// Image cache to prevent duplicate downloads during the same document generation
2937
const imageCache = new Map();
3038

31-
3239
// Track retry statistics
3340
let retryStats = {
3441
totalAttempts: 0,
3542
successAfterRetry: 0,
36-
finalFailures: 0
43+
finalFailures: 0,
3744
};
3845

3946
// Function to clear the image cache (useful for testing or memory management)
@@ -44,7 +51,7 @@ export const clearImageCache = () => {
4451
retryStats = {
4552
totalAttempts: 0,
4653
successAfterRetry: 0,
47-
finalFailures: 0
54+
finalFailures: 0,
4855
};
4956
return cacheSize;
5057
};
@@ -53,9 +60,9 @@ export const clearImageCache = () => {
5360
export const getImageCacheStats = () => ({
5461
size: imageCache.size,
5562
urls: Array.from(imageCache.keys()),
56-
successCount: Array.from(imageCache.values()).filter(v => v !== null).length,
57-
failureCount: Array.from(imageCache.values()).filter(v => v === null).length,
58-
retryStats
63+
successCount: Array.from(imageCache.values()).filter((v) => v !== null).length,
64+
failureCount: Array.from(imageCache.values()).filter((v) => v === null).length,
65+
retryStats,
5966
});
6067

6168
// Helper function for conditional verbose logging
@@ -67,22 +74,36 @@ const logVerbose = (verboseLogging, message, ...args) => {
6774
};
6875

6976
// eslint-disable-next-line consistent-return, no-shadow
70-
export const buildImage = async (docxDocumentInstance, vNode, maximumWidth = null, options = {}) => {
77+
export const buildImage = async (
78+
docxDocumentInstance,
79+
vNode,
80+
maximumWidth = null,
81+
options = {}
82+
) => {
7183
// Extract image processing options with defaults from constants.js
72-
const maxRetries = options.maxRetries || docxDocumentInstance.imageProcessing?.maxRetries || defaultDocumentOptions.imageProcessing.maxRetries;
73-
const verboseLogging = options.verboseLogging || docxDocumentInstance.imageProcessing?.verboseLogging || defaultDocumentOptions.imageProcessing.verboseLogging;
84+
const maxRetries =
85+
options.maxRetries ||
86+
docxDocumentInstance.imageProcessing?.maxRetries ||
87+
defaultDocumentOptions.imageProcessing.maxRetries;
88+
const verboseLogging =
89+
options.verboseLogging ||
90+
docxDocumentInstance.imageProcessing?.verboseLogging ||
91+
defaultDocumentOptions.imageProcessing.verboseLogging;
7492
let response = null;
7593
let base64Uri = null;
76-
94+
7795
try {
7896
const imageSource = vNode.properties.src;
79-
97+
8098
// Check cache first for external URLs
8199
if (isValidUrl(imageSource) && imageCache.has(imageSource)) {
82100
const cachedData = imageCache.get(imageSource);
83101
if (cachedData === null) {
84102
// Previously failed to download in this document generation, skip this image
85-
logVerbose(verboseLogging, `[CACHE] Skipping previously failed image in this document: ${imageSource}`);
103+
logVerbose(
104+
verboseLogging,
105+
`[CACHE] Skipping previously failed image in this document: ${imageSource}`
106+
);
86107
return null;
87108
}
88109
logVerbose(verboseLogging, `[CACHE] Using cached image data for: ${imageSource}`);
@@ -93,35 +114,43 @@ export const buildImage = async (docxDocumentInstance, vNode, maximumWidth = nul
93114
// Download and cache the image with retry mechanism
94115
let base64String = null;
95116
let lastError = null;
96-
117+
97118
for (let attempt = 1; attempt <= maxRetries; attempt += 1) {
98119
retryStats.totalAttempts += 1;
99-
120+
100121
try {
101-
logVerbose(verboseLogging, `[RETRY] Attempt ${attempt}/${maxRetries} for: ${imageSource}`);
102-
122+
logVerbose(
123+
verboseLogging,
124+
`[RETRY] Attempt ${attempt}/${maxRetries} for: ${imageSource}`
125+
);
126+
103127
base64String = await imageToBase64(imageSource);
104128
if (base64String) {
105129
if (attempt > 1) {
106130
retryStats.successAfterRetry += 1;
107-
logVerbose(verboseLogging, `[RETRY] Success on attempt ${attempt} for: ${imageSource}`);
131+
logVerbose(
132+
verboseLogging,
133+
`[RETRY] Success on attempt ${attempt} for: ${imageSource}`
134+
);
108135
}
109136
break;
110137
}
111138
} catch (error) {
112139
lastError = error;
113140
// eslint-disable-next-line no-console
114-
console.warn(`[RETRY] Attempt ${attempt}/${maxRetries} failed for ${imageSource}: ${error.message}`);
115-
141+
console.warn(
142+
`[RETRY] Attempt ${attempt}/${maxRetries} failed for ${imageSource}: ${error.message}`
143+
);
144+
116145
// Add delay before retry (exponential backoff: 500ms, 1000ms, etc.)
117146
if (attempt < maxRetries) {
118147
const delay = 500 * attempt;
119148
logVerbose(verboseLogging, `[RETRY] Waiting ${delay}ms before retry...`);
120-
await new Promise(resolve => setTimeout(resolve, delay));
149+
await new Promise((resolve) => setTimeout(resolve, delay));
121150
}
122151
}
123152
}
124-
153+
125154
if (!base64String) {
126155
retryStats.finalFailures += 1;
127156
}
@@ -139,7 +168,11 @@ export const buildImage = async (docxDocumentInstance, vNode, maximumWidth = nul
139168
// Note: Cache is cleared between document generations, so failures can be retried in future runs
140169
imageCache.set(imageSource, null);
141170
// eslint-disable-next-line no-console
142-
console.error(`[ERROR] buildImage: Failed to convert URL to base64 after ${maxRetries} attempts: ${lastError?.message || 'Unknown error'} - will skip duplicates in this document`);
171+
console.error(
172+
`[ERROR] buildImage: Failed to convert URL to base64 after ${maxRetries} attempts: ${
173+
lastError?.message || 'Unknown error'
174+
} - will skip duplicates in this document`
175+
);
143176
}
144177
} else {
145178
base64Uri = decodeURIComponent(vNode.properties.src);
@@ -163,14 +196,19 @@ export const buildImage = async (docxDocumentInstance, vNode, maximumWidth = nul
163196
// Validate response has required properties
164197
if (!response.fileContent || !response.fileNameWithExtension) {
165198
// eslint-disable-next-line no-console
166-
console.error(`[ERROR] buildImage: Invalid response object for ${vNode.properties.src}:`, response);
199+
console.error(
200+
`[ERROR] buildImage: Invalid response object for ${vNode.properties.src}:`,
201+
response
202+
);
167203
return null;
168204
}
169205

206+
const imageBuffer = Buffer.from(response.fileContent, 'base64');
207+
170208
docxDocumentInstance.zip
171209
.folder('word')
172210
.folder('media')
173-
.file(response.fileNameWithExtension, Buffer.from(response.fileContent, 'base64'), {
211+
.file(response.fileNameWithExtension, imageBuffer, {
174212
createFolders: false,
175213
});
176214

@@ -181,39 +219,43 @@ export const buildImage = async (docxDocumentInstance, vNode, maximumWidth = nul
181219
internalRelationship
182220
);
183221

184-
const imageBuffer = Buffer.from(response.fileContent, 'base64');
185-
186222
// Add validation before calling sizeOf
187223
if (!imageBuffer || imageBuffer.length === 0) {
188224
// eslint-disable-next-line no-console
189225
console.error(`[ERROR] buildImage: Empty image buffer for ${vNode.properties.src}`);
190226
return null;
191227
}
192-
228+
193229
// Check if we got HTML instead of image data (common with Wikimedia errors)
194230
const firstBytes = imageBuffer.slice(0, 20).toString('utf8');
195231
if (firstBytes.startsWith('<!DOCTYPE') || firstBytes.startsWith('<html')) {
196232
// eslint-disable-next-line no-console
197-
console.error(`[ERROR] buildImage: Received HTML instead of image data for ${vNode.properties.src}`);
233+
console.error(
234+
`[ERROR] buildImage: Received HTML instead of image data for ${vNode.properties.src}`
235+
);
198236
return null;
199237
}
200238

201239
let imageProperties;
202-
try {
240+
try {
203241
imageProperties = sizeOf(imageBuffer);
204242
if (!imageProperties || !imageProperties.width || !imageProperties.height) {
205243
// eslint-disable-next-line no-console
206-
console.error(`[ERROR] buildImage: Invalid image properties for ${vNode.properties.src}:`, imageProperties);
244+
console.error(
245+
`[ERROR] buildImage: Invalid image properties for ${vNode.properties.src}:`,
246+
imageProperties
247+
);
207248
return null;
208249
}
209250
} catch (sizeError) {
210251
// eslint-disable-next-line no-console
211-
console.error(`[ERROR] buildImage: sizeOf failed for ${vNode.properties.src}:`, sizeError.message);
252+
console.error(
253+
`[ERROR] buildImage: sizeOf failed for ${vNode.properties.src}:`,
254+
sizeError.message
255+
);
212256
return null;
213257
}
214258

215-
216-
217259
const imageFragment = await xmlBuilder.buildParagraph(
218260
vNode,
219261
{
@@ -434,19 +476,17 @@ async function findXMLEquivalent(docxDocumentInstance, vNode, xmlFragment, image
434476
xmlFragment.import(emptyParagraphFragment);
435477
}
436478
} else if (childVNode.tagName === 'img') {
437-
const imageFragment = await buildImage(docxDocumentInstance, childVNode, null, imageOptions);
479+
const imageFragment = await buildImage(
480+
docxDocumentInstance,
481+
childVNode,
482+
null,
483+
imageOptions
484+
);
438485
if (imageFragment) {
439486
// Add lineRule attribute for consistency
440487
// Direct image processing includes this attribute, but HTML image processing was missing it
441488
// This ensures both processing paths generate identical XML structure
442-
imageFragment
443-
.first()
444-
.first()
445-
.att(
446-
'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
447-
'lineRule',
448-
'auto'
449-
);
489+
addLineRuleToImageFragment(imageFragment);
450490
xmlFragment.import(imageFragment);
451491
} else {
452492
// eslint-disable-next-line no-console
@@ -484,10 +524,7 @@ async function findXMLEquivalent(docxDocumentInstance, vNode, xmlFragment, image
484524
// Add lineRule attribute for consistency
485525
// Direct image processing includes this attribute, but HTML image processing was missing it
486526
// This ensures both processing paths generate identical XML structure
487-
imageFragment
488-
.first()
489-
.first()
490-
.att('http://schemas.openxmlformats.org/wordprocessingml/2006/main', 'lineRule', 'auto');
527+
addLineRuleToImageFragment(imageFragment);
491528
xmlFragment.import(imageFragment);
492529
} else {
493530
// eslint-disable-next-line no-console
@@ -512,7 +549,12 @@ async function findXMLEquivalent(docxDocumentInstance, vNode, xmlFragment, image
512549
}
513550

514551
// eslint-disable-next-line consistent-return
515-
export async function convertVTreeToXML(docxDocumentInstance, vTree, xmlFragment, imageOptions = null) {
552+
export async function convertVTreeToXML(
553+
docxDocumentInstance,
554+
vTree,
555+
xmlFragment,
556+
imageOptions = null
557+
) {
516558
// Use default options if not provided
517559
if (!imageOptions) {
518560
imageOptions = docxDocumentInstance.imageProcessing || defaultDocumentOptions.imageProcessing;
@@ -538,34 +580,46 @@ export async function convertVTreeToXML(docxDocumentInstance, vTree, xmlFragment
538580

539581
async function renderDocumentFile(docxDocumentInstance) {
540582
// Get image processing options from document instance with centralized defaults
541-
const imageOptions = docxDocumentInstance.imageProcessing || defaultDocumentOptions.imageProcessing;
583+
const imageOptions =
584+
docxDocumentInstance.imageProcessing || defaultDocumentOptions.imageProcessing;
542585
// Clear image cache at the start of each document generation to allow retrying failed URLs in new documents
543586
const previousCacheSize = clearImageCache();
544587
if (previousCacheSize > 0 && imageOptions.verboseLogging) {
545588
// eslint-disable-next-line no-console
546-
console.log(`[CACHE] Cleared previous cache (${previousCacheSize} images) for new document generation`);
589+
console.log(
590+
`[CACHE] Cleared previous cache (${previousCacheSize} images) for new document generation`
591+
);
547592
}
548-
593+
549594
const vTree = convertHTML(docxDocumentInstance.htmlString);
550595

551596
const xmlFragment = fragment({ namespaceAlias: { w: namespaces.w } });
552597

553-
const populatedXmlFragment = await convertVTreeToXML(docxDocumentInstance, vTree, xmlFragment, imageOptions);
598+
const populatedXmlFragment = await convertVTreeToXML(
599+
docxDocumentInstance,
600+
vTree,
601+
xmlFragment,
602+
imageOptions
603+
);
554604

555605
// Log cache statistics at the end of document generation
556606
const cacheStats = getImageCacheStats();
557-
if ((cacheStats.size > 0 || cacheStats.retryStats.totalAttempts > 0) && imageOptions.verboseLogging) {
607+
if (
608+
(cacheStats.size > 0 || cacheStats.retryStats.totalAttempts > 0) &&
609+
imageOptions.verboseLogging
610+
) {
558611
// eslint-disable-next-line no-console
559612
console.log(`[CACHE] Image processing statistics:`, {
560613
totalImages: cacheStats.size,
561614
successful: cacheStats.successCount,
562615
failed: cacheStats.failureCount,
563-
cacheHitRatio: cacheStats.size > 1 ? 'Cache prevented duplicate downloads' : 'No duplicates found',
616+
cacheHitRatio:
617+
cacheStats.size > 1 ? 'Cache prevented duplicate downloads' : 'No duplicates found',
564618
retries: {
565619
totalAttempts: cacheStats.retryStats.totalAttempts,
566620
successAfterRetry: cacheStats.retryStats.successAfterRetry,
567-
finalFailures: cacheStats.retryStats.finalFailures
568-
}
621+
finalFailures: cacheStats.retryStats.finalFailures,
622+
},
569623
});
570624
}
571625

0 commit comments

Comments
 (0)