<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>http://gisaxs.com/index.php?action=history&amp;feed=atom&amp;title=Data_Extraction</id>
	<title>Data Extraction - Revision history</title>
	<link rel="self" type="application/atom+xml" href="http://gisaxs.com/index.php?action=history&amp;feed=atom&amp;title=Data_Extraction"/>
	<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;action=history"/>
	<updated>2026-04-08T16:48:06Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.31.7</generator>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8730&amp;oldid=prev</id>
		<title>KevinYager: /* Optical character recognition (OCR) */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8730&amp;oldid=prev"/>
		<updated>2026-03-16T15:24:13Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Optical character recognition (OCR)&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:24, 16 March 2026&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l86&quot; &gt;Line 86:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 86:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://hunyuan.tencent.com/vision/zh?tabIndex=0 HunyuanOCR]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://hunyuan.tencent.com/vision/zh?tabIndex=0 HunyuanOCR]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* 2026-03: [https://arxiv.org/abs/2603.13032 Multimodal OCR: Parse Anything from Documents] ([https://huggingface.co/papers/2603.13032 hf])&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Information Extraction=&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Information Extraction=&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* 2025-07: Google [https://github.com/google/langextract LangExtract]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* 2025-07: Google [https://github.com/google/langextract LangExtract]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8438&amp;oldid=prev</id>
		<title>KevinYager: /* Optical character recognition (OCR) */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8438&amp;oldid=prev"/>
		<updated>2025-12-04T18:52:12Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Optical character recognition (OCR)&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:52, 4 December 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l85&quot; &gt;Line 85:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 85:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://hunyuan.tencent.com/vision/zh?tabIndex=0 HunyuanOCR]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Information Extraction=&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Information Extraction=&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* 2025-07: Google [https://github.com/google/langextract LangExtract]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* 2025-07: Google [https://github.com/google/langextract LangExtract]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8125&amp;oldid=prev</id>
		<title>KevinYager: /* Document Parsing */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8125&amp;oldid=prev"/>
		<updated>2025-07-31T12:07:26Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Document Parsing&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 12:07, 31 July 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l85&quot; &gt;Line 85:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 85:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://reducto.ai/blog/introducing-rolmocr-open-source-ocr-model RolmOCR] ([https://huggingface.co/reducto/RolmOCR hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;=Information Extraction=&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* 2025-07: Google [https://github.com/google/langextract LangExtract]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8109&amp;oldid=prev</id>
		<title>KevinYager: /* PDF Conversion */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=8109&amp;oldid=prev"/>
		<updated>2025-07-27T17:55:25Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;PDF Conversion&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 17:55, 27 July 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l67&quot; &gt;Line 67:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 67:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://olmocr.allenai.org/blog olmOCR] ([https://olmocr.allenai.org/ online demo], [https://github.com/allenai/olmocr code], [https://huggingface.co/collections/allenai/olmocr-67af8630b0062a25bf1b54a1 hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://olmocr.allenai.org/blog olmOCR] ([https://olmocr.allenai.org/ online demo], [https://github.com/allenai/olmocr code], [https://huggingface.co/collections/allenai/olmocr-67af8630b0062a25bf1b54a1 hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://mistral.ai/news/mistral-ocr Mistral OCR]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://github.com/MarkPDFdown/markpdfdown MarkPDFDown]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Language Translation==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Language Translation==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7757&amp;oldid=prev</id>
		<title>KevinYager: /* Document Parsing */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7757&amp;oldid=prev"/>
		<updated>2025-05-12T12:18:37Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Document Parsing&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 12:18, 12 May 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l43&quot; &gt;Line 43:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 43:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Document Parsing=&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=Document Parsing=&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://idp-leaderboard.org/ Intelligent Document Processing Leaderboard] A unified leaderboard for OCR, KIE, classification, QA, table extraction, and confidence score evaluation&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;==Systems==&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/DS4SD/docling Docling]: converts multiple formats (PDF, DOCX, PPTX, Images, HTML) into Markdown and JSON&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/DS4SD/docling Docling]: converts multiple formats (PDF, DOCX, PPTX, Images, HTML) into Markdown and JSON&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/microsoft/markitdown Microsoft Markitdown]: converts various formats (PDF, Word, Excel, PPT) to Markdown (available via [https://msftmd.replit.app/ web interface on replit])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/microsoft/markitdown Microsoft Markitdown]: converts various formats (PDF, Word, Excel, PPT) to Markdown (available via [https://msftmd.replit.app/ web interface on replit])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7722&amp;oldid=prev</id>
		<title>KevinYager: /* Document Parsing */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7722&amp;oldid=prev"/>
		<updated>2025-05-07T15:55:30Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Document Parsing&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:55, 7 May 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l54&quot; &gt;Line 54:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 54:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2503.11576 SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion] 256M ([https://huggingface.co/ds4sd/SmolDocling-256M-preview hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2503.11576 SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion] 256M ([https://huggingface.co/ds4sd/SmolDocling-256M-preview hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2501.06659 TWIX: Automatically Reconstructing Structured Data from Templatized Documents]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2501.06659 TWIX: Automatically Reconstructing Structured Data from Templatized Documents]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://github.com/shcherbak-ai/contextgem ContextGem]: LLM-based extraction from documents&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Conversion==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Conversion==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7700&amp;oldid=prev</id>
		<title>KevinYager: /* Document Parsing */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7700&amp;oldid=prev"/>
		<updated>2025-05-03T13:33:06Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Document Parsing&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 13:33, 3 May 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l53&quot; &gt;Line 53:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 53:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/VikParuchuri/marker Marker]: PDFs and images to markdown&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/VikParuchuri/marker Marker]: PDFs and images to markdown&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2503.11576 SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion] 256M ([https://huggingface.co/ds4sd/SmolDocling-256M-preview hf])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2503.11576 SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion] 256M ([https://huggingface.co/ds4sd/SmolDocling-256M-preview hf])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://arxiv.org/abs/2501.06659 TWIX: Automatically Reconstructing Structured Data from Templatized Documents]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Conversion==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==PDF Conversion==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7625&amp;oldid=prev</id>
		<title>KevinYager: /* Code &amp; Github */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7625&amp;oldid=prev"/>
		<updated>2025-04-18T17:34:36Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Code &amp;amp; Github&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 17:34, 18 April 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l33&quot; &gt;Line 33:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 33:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/tesserato/CodeWeaver CodeWeaver]: Generate a Markdown Document of Your Codebase Structure and Content&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/tesserato/CodeWeaver CodeWeaver]: Generate a Markdown Document of Your Codebase Structure and Content&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/Doriandarko/RepoToTextForLLMs RepoToTextForLLMs]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://github.com/Doriandarko/RepoToTextForLLMs RepoToTextForLLMs]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [https://uithub.com/ uithub]: Change &amp;#039;g&amp;#039; to &amp;#039;u&amp;#039; in github to get llm.txt&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;===Related===&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;===Related===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7526&amp;oldid=prev</id>
		<title>KevinYager: /* llms.txt Generator (Online) */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7526&amp;oldid=prev"/>
		<updated>2025-04-10T15:24:49Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;llms.txt Generator (Online)&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:24, 10 April 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l19&quot; &gt;Line 19:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 19:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==llms.txt Generator (Online)==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==llms.txt Generator (Online)==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Firecrawl [https://llmstxt.firecrawl.dev/ LLMs.txt generator] (online tool, [https://docs.firecrawl.dev/features/alpha/llmstxt API])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Firecrawl [https://llmstxt.firecrawl.dev/ LLMs.txt generator] (online tool, [https://docs.firecrawl.dev/features/alpha/llmstxt API])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;** Just add llmstxt.new in front of any URL&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;** Just add llmstxt.new in front of any URL&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;; e.g. [http://llmstxt.new/http://yager-research.ca http://llmstxt.new/http://yager-research.ca]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Jina AI [https://github.com/jina-ai/reader Reader]: Convert any URL to an LLM-friendly input with a prefix: https://r.jina.ai/&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Jina AI [https://github.com/jina-ai/reader Reader]: Convert any URL to an LLM-friendly input with a prefix: https://r.jina.ai/&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://www.arxiv-txt.org/ arXiv-txt]: Convert any arXiv paper to llms.txt version&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://www.arxiv-txt.org/ arXiv-txt]: Convert any arXiv paper to llms.txt version&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
	<entry>
		<id>http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7525&amp;oldid=prev</id>
		<title>KevinYager: /* llms.txt Generator (Online) */</title>
		<link rel="alternate" type="text/html" href="http://gisaxs.com/index.php?title=Data_Extraction&amp;diff=7525&amp;oldid=prev"/>
		<updated>2025-04-10T15:21:28Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;llms.txt Generator (Online)&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:21, 10 April 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l19&quot; &gt;Line 19:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 19:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==llms.txt Generator (Online)==&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==llms.txt Generator (Online)==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Firecrawl [https://llmstxt.firecrawl.dev/ LLMs.txt generator] (online tool, [https://docs.firecrawl.dev/features/alpha/llmstxt API])&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Firecrawl [https://llmstxt.firecrawl.dev/ LLMs.txt generator] (online tool, [https://docs.firecrawl.dev/features/alpha/llmstxt API])&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;** Just add llmstxt.new in front of any URL&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Jina AI [https://github.com/jina-ai/reader Reader]: Convert any URL to an LLM-friendly input with a prefix: https://r.jina.ai/&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* Jina AI [https://github.com/jina-ai/reader Reader]: Convert any URL to an LLM-friendly input with a prefix: https://r.jina.ai/&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://www.arxiv-txt.org/ arXiv-txt]: Convert any arXiv paper to llms.txt version&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://www.arxiv-txt.org/ arXiv-txt]: Convert any arXiv paper to llms.txt version&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>KevinYager</name></author>
		
	</entry>
</feed>