1515import libzim .writer # pyright: ignore
1616
1717from zimscraperlib .download import stream_file
18+ from zimscraperlib .filesystem import get_content_mimetype , get_file_mimetype
19+ from zimscraperlib .zim .indexing import IndexData , get_pdf_index_data
1820from zimscraperlib .zim .providers import (
1921 FileLikeProvider ,
2022 FileProvider ,
@@ -69,7 +71,17 @@ class StaticItem(Item):
6971 Sets a `ref` to itself on the File/String content providers so it outlives them
7072 We need Item to survive its ContentProvider so that we can track lifecycle
7173 more efficiently: now when the libzim destroys the CP, python will destroy
72- the Item and we can be notified that we're effectively through with our content"""
74+ the Item and we can be notified that we're effectively through with our content
75+
76+ By default, content is automatically indexed (either by the libzim itself for
77+ supported documents - text or html for now or by the python-scraperlib - only PDF
78+ supported for now). If you do not want this, set `auto_index` to False to disable
79+ both indexing (libzim and python-scraperlib).
80+
81+ It is also possible to pass index_data to configure custom indexing of the item.
82+
83+ If item title is not set by caller, it is automatically populated from index_data.
84+ """
7385
7486 def __init__ (
7587 self ,
@@ -80,6 +92,9 @@ def __init__(
8092 title : str | None = None ,
8193 mimetype : str | None = None ,
8294 hints : dict | None = None ,
95+ index_data : IndexData | None = None ,
96+ * ,
97+ auto_index : bool = True ,
8398 ** kwargs : Any ,
8499 ):
85100 if content is not None :
@@ -91,6 +106,20 @@ def __init__(
91106 super ().__init__ (
92107 path = path , title = title , mimetype = mimetype , hints = hints , ** kwargs
93108 )
109+ if index_data :
110+ self .get_indexdata = lambda : index_data
111+ elif not auto_index :
112+ self .get_indexdata = lambda : IndexData ("" , "" ) # index nothing
113+ else :
114+ self ._get_auto_index () # consider to add auto index
115+
116+ # Populate item title from index data if title is not set by caller
117+ if (
118+ (not hasattr (self , "title" ) or not self .title )
119+ and hasattr (self , "get_indexdata" )
120+ and self .get_indexdata ().get_title ()
121+ ):
122+ self .title = self .get_indexdata ().get_title ()
94123
95124 def get_contentprovider (self ) -> libzim .writer .ContentProvider :
96125 # content was set manually
@@ -116,6 +145,53 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
116145
117146 raise NotImplementedError ("No data to provide`" )
118147
def _get_auto_index(self):
    """Populate the item's index data automatically from its content.

    The item's sources are inspected in this order: the ``content``
    attribute, then ``fileobj``, then ``filepath``. Only the first source
    that is present is considered. When its detected mimetype is
    ``application/pdf``, ``self.get_indexdata`` is replaced by a callable
    returning the index data extracted from the PDF. For any other mimetype
    ``get_indexdata`` is left untouched.

    This method does not set the item title itself; it only provides the
    index data.

    Raises:
        RuntimeError: if ``content`` is neither ``str`` nor ``bytes``, if
            ``fileobj`` is not an ``io.BytesIO``, or if ``filepath`` is not
            a ``pathlib.Path``.
    """
    pdf_mimetype = "application/pdf"

    # content was set manually
    content = getattr(self, "content", None)
    if content is not None:
        if not isinstance(content, (str, bytes)):
            raise RuntimeError(
                f"Unexpected type for content: {type(content)}"
            )  # pragma: no cover
        # mimetype detection works on bytes, so encode textual content first
        raw = content.encode("utf-8") if isinstance(content, str) else content
        if get_content_mimetype(raw) == pdf_mimetype:
            index_data = get_pdf_index_data(content=content)
            self.get_indexdata = lambda: index_data
        # Stop here whatever the mimetype: a later source must not override
        # (or invalidate) what was decided from the first source present.
        return

    # using a file-like object
    fileobj = getattr(self, "fileobj", None)
    if fileobj:
        if not isinstance(fileobj, io.BytesIO):
            raise RuntimeError(
                f"Unexpected type for content: {type(fileobj)}"
            )  # pragma: no cover
        # getvalue() reads the whole buffer without moving its position
        if get_content_mimetype(fileobj.getvalue()) == pdf_mimetype:
            index_data = get_pdf_index_data(fileobj=fileobj)
            self.get_indexdata = lambda: index_data
        return

    # using a file path
    filepath = getattr(self, "filepath", None)
    if filepath:
        if not isinstance(filepath, pathlib.Path):
            raise RuntimeError(
                f"Unexpected type for content: {type(filepath)}"
            )  # pragma: no cover
        if get_file_mimetype(filepath) == pdf_mimetype:
            index_data = get_pdf_index_data(filepath=filepath)
            self.get_indexdata = lambda: index_data
        return
194+
119195
120196class URLItem (StaticItem ):
121197 """StaticItem to automatically fetch and feed an URL resource
0 commit comments