operators.candidates.rich_doc_features.RichDocRegexNGramDetector
class operators.candidates.rich_doc_features.RichDocRegexNGramDetector(regex, targetfield=None, capturegroup=0, case_sensitive=True)
operators.candidates.rich_doc_features.RichDocRegexPageFeaturizer
class operators.candidates.rich_doc_features.RichDocRegexPageFeaturizer(regexpattern, casesensitive=False)
operators.candidates.rich_doc_features.RichDocSpanBaseFeaturesPreprocessor
class operators.candidates.rich_doc_features.RichDocSpanBaseFeaturesPreprocessor
operators.candidates.rich_doc_features.RichDocSpanRowFeaturesPreprocessor
class operators.candidates.rich_doc_features.RichDocSpanRowFeaturesPreprocessor(rowid=False, rowtextbefore=0, rowtextinline=False, rowtextafter=0, rowheader=False...
operators.candidates.rich_doc_features.RichDocSpanStructuralPreprocessor
class operators.candidates.rich_doc_features.RichDocSpanStructuralPreprocessor(window=1, scopeunit='line', direction='before or after', featurename_override=None)
operators.candidates.rich_doc_features.RichDocSpanVisualPreprocessor
class operators.candidates.rich_doc_features.RichDocSpanVisualPreprocessor(location='center', scope='page', threshold=50, thresholdunit='pixels', thresholddir='le...
operators.candidates.rich_doc_page.RichDocPagePreprocessor
class operators.candidates.rich_doc_page.RichDocPagePreprocessor
operators.pdf.checkbox.CheckboxFeaturizer
class operators.pdf.checkbox.CheckboxFeaturizer(pdfurlfield='richdocpdfurl', minboxlengthpx=25, maxboxlengthpx=55, pxthresholdratio=0.1, numpagesperbatch=100, p...
operators.pdf.checkbox.CheckboxSpanMapper
class operators.pdf.checkbox.CheckboxSpanMapper(pxdistance=100, leftcheckboxes=True, rightcheckboxes=False, topcheckboxes=False, bottom_checkboxes=False)
operators.pdf.hocr.HocrToRichDocParser
class operators.pdf.hocr.HocrToRichDocParser(field, dropfield=True, ignoreerrors=False)
operators.pdf.hocr.TruncateHOCR
class operators.pdf.hocr.TruncateHOCR(field, targetfield=None, pages=5, ignoreerrors=False)
operators.pdf.lines.LinesFeaturizer
class operators.pdf.lines.LinesFeaturizer(field, minlengthpx=21, numpagesperbatch=100, pagesfield='context_pages')
operators.pdf.lines.LinesPageFilterFeaturizer
class operators.pdf.lines.LinesPageFilterFeaturizer(pages_field)
operators.pdf.page_splitter.PageSplitter
class operators.pdf.page_splitter.PageSplitter(windowsize=0)
operators.pdf.parser.PDFToRichDocParser
class operators.pdf.parser.PDFToRichDocParser(field, removesuperscripts=False, pagesfield=None, extractpars=False, parserparams='{"charmargin": 0.1, "alltexts": ...
operators.pdf.parser2.PDFToRichDocParser2
class operators.pdf.parser2.PDFToRichDocParser2(field, parser_version=1)
operators.pdf.table.TableFeaturizer
class operators.pdf.table.TableFeaturizer(field='richdocpdfurl', model='microsoft/table-transformer-structure-recognition', pagesfield=None)
operators.pdf.table.TableSpanMapper
class operators.pdf.table.TableSpanMapper(confidence_threshold=0.9)
operators.pdf.text_cluster.TextClusterSpanExtractor
class operators.pdf.text_cluster.TextClusterSpanExtractor
operators.pdf.text_cluster.TextClusterSpanFeaturizer
class operators.pdf.text_cluster.TextClusterSpanFeaturizer
operators.pdf.text_cluster.TextClusterer
class operators.pdf.text_cluster.TextClusterer(wordspacingtolerance=0.75, mergewordsbetweenverticallines=False, mergerowsbetweenhorizontallines=False, pagesfield...
operators.pdf.truncate_pdf.TruncatePDF
class operators.pdf.truncate_pdf.TruncatePDF(field, pdfstoragedir, targetfield=None, pages=5, ignore_errors=False)
operators.row_filter.TableRowFilter
class operators.row_filter.TableRowFilter(confidencescore=0.9)