Skip to content

Commit d724a4b

Browse files
Ark-kun and k8s-ci-robot
authored and committed
SDK - Controlling which modules are captured with Lightweight components (#1435)
* SDK - Controlling which modules are captured with Lightweight components. All func_to_* functions now accept the modules_to_capture parameter: a list of module names that will be captured (instead of just being referenced) during the dependency scan. By default, func.__module__ is captured. * Described the behavior in more depth. * Added a test to check that only dependencies are captured.
1 parent a3e7a0a commit d724a4b

File tree

4 files changed

+80
-14
lines changed

4 files changed

+80
-14
lines changed

sdk/python/kfp/components/_python_op.py

+25-14
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from ._structures import *
2323

2424
from pathlib import Path
25-
from typing import TypeVar, Generic
25+
from typing import TypeVar, Generic, List
2626

2727
T = TypeVar('T')
2828

@@ -45,16 +45,23 @@ def _python_function_name_to_component_name(name):
4545
return re.sub(' +', ' ', name.replace('_', ' ')).strip(' ').capitalize()
4646

4747

48-
def _capture_function_code_using_cloudpickle(func) -> str:
48+
def _capture_function_code_using_cloudpickle(func, modules_to_capture: List[str] = None) -> str:
4949
import sys
5050
import cloudpickle
5151
import pickle
52+
53+
if modules_to_capture is None:
54+
modules_to_capture = [func.__module__]
55+
5256
# Hack to force cloudpickle to capture the whole function instead of just referencing the code file. See https://github.com/cloudpipe/cloudpickle/blob/74d69d759185edaeeac7bdcb7015cfc0c652f204/cloudpickle/cloudpickle.py#L490
57+
old_modules = {}
5358
try: # Try is needed to restore the state if something goes wrong
54-
old_module = sys.modules.pop(func.__module__)
59+
for module_name in modules_to_capture:
60+
if module_name in sys.modules:
61+
old_modules[module_name] = sys.modules.pop(module_name)
5562
func_pickle = cloudpickle.dumps(func, pickle.DEFAULT_PROTOCOL)
5663
finally:
57-
sys.modules[func.__module__] = old_module
64+
sys.modules.update(old_modules)
5865
func_code = '{func_name} = pickle.loads({func_pickle})'.format(func_name=func.__name__, func_pickle=repr(func_pickle))
5966

6067
code_lines = [
@@ -73,14 +80,15 @@ def _capture_function_code_using_cloudpickle(func) -> str:
7380
return '\n'.join(code_lines)
7481

7582

76-
def _func_to_component_spec(func, extra_code='', base_image=_default_base_image) -> ComponentSpec:
83+
def _func_to_component_spec(func, extra_code='', base_image=_default_base_image, modules_to_capture: List[str] = None) -> ComponentSpec:
7784
'''Takes a self-contained python function and converts it to component
7885
7986
Args:
8087
func: Required. The function to be converted
8188
base_image: Optional. Docker image to be used as a base image for the python component. Must have python 3.5+ installed. Default is tensorflow/tensorflow:1.11.0-py3
8289
Note: The image can also be specified by decorating the function with the @python_component decorator. If different base images are explicitly specified in both places, an error is raised.
8390
extra_code: Optional. Python source code that gets placed before the function code. Can be used as workaround to define types used in function signature.
91+
modules_to_capture: Optional. List of module names that will be captured (instead of just referencing) during the dependency scan. By default the func.__module__ is captured.
8492
'''
8593
decorator_base_image = getattr(func, '_component_base_image', None)
8694
if decorator_base_image is not None:
@@ -155,7 +163,7 @@ def annotation_to_type_struct(annotation):
155163

156164
func_name=func.__name__
157165

158-
func_code = _capture_function_code_using_cloudpickle(func)
166+
func_code = _capture_function_code_using_cloudpickle(func, modules_to_capture)
159167

160168
extra_output_external_names = [name + '_file' for name in extra_output_names]
161169

@@ -232,11 +240,11 @@ def annotation_to_type_struct(annotation):
232240
return component_spec
233241

234242

235-
def _func_to_component_dict(func, extra_code='', base_image=_default_base_image):
236-
return _func_to_component_spec(func, extra_code, base_image).to_dict()
243+
def _func_to_component_dict(func, extra_code='', base_image=_default_base_image, modules_to_capture: List[str] = None):
244+
return _func_to_component_spec(func, extra_code, base_image, modules_to_capture).to_dict()
237245

238246

239-
def func_to_component_text(func, extra_code='', base_image=_default_base_image):
247+
def func_to_component_text(func, extra_code='', base_image=_default_base_image, modules_to_capture: List[str] = None):
240248
'''
241249
Converts a Python function to a component definition and returns its textual representation
242250
@@ -254,15 +262,16 @@ def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('s
254262
base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python 3.5+. Default is tensorflow/tensorflow:1.11.0-py3
255263
Note: The image can also be specified by decorating the function with the @python_component decorator. If different base images are explicitly specified in both places, an error is raised.
256264
extra_code: Optional. Extra code to add before the function code. Can be used as workaround to define types used in function signature.
265+
modules_to_capture: Optional. List of module names that will be captured (instead of just referencing) during the dependency scan. By default the func.__module__ is captured. The actual algorithm: Starting with the initial function, start traversing dependencies. If the dependency.__module__ is in the modules_to_capture list then it's captured and its dependencies are traversed. Otherwise the dependency is only referenced instead of being captured and its dependencies are not traversed.
257266
258267
Returns:
259268
Textual representation of a component definition
260269
'''
261-
component_dict = _func_to_component_dict(func, extra_code, base_image)
270+
component_dict = _func_to_component_dict(func, extra_code, base_image, modules_to_capture)
262271
return dump_yaml(component_dict)
263272

264273

265-
def func_to_component_file(func, output_component_file, base_image=_default_base_image, extra_code='') -> None:
274+
def func_to_component_file(func, output_component_file, base_image=_default_base_image, extra_code='', modules_to_capture: List[str] = None) -> None:
266275
'''
267276
Converts a Python function to a component definition and writes it to a file
268277
@@ -281,14 +290,15 @@ def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('s
281290
base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python 3.5+. Default is tensorflow/tensorflow:1.11.0-py3
282291
Note: The image can also be specified by decorating the function with the @python_component decorator. If different base images are explicitly specified in both places, an error is raised.
283292
extra_code: Optional. Extra code to add before the function code. Can be used as workaround to define types used in function signature.
293+
modules_to_capture: Optional. List of module names that will be captured (instead of just referencing) during the dependency scan. By default the func.__module__ is captured. The actual algorithm: Starting with the initial function, start traversing dependencies. If the dependency.__module__ is in the modules_to_capture list then it's captured and its dependencies are traversed. Otherwise the dependency is only referenced instead of being captured and its dependencies are not traversed.
284294
'''
285295

286-
component_yaml = func_to_component_text(func, extra_code, base_image)
296+
component_yaml = func_to_component_text(func, extra_code, base_image, modules_to_capture)
287297

288298
Path(output_component_file).write_text(component_yaml)
289299

290300

291-
def func_to_container_op(func, output_component_file=None, base_image=_default_base_image, extra_code=''):
301+
def func_to_container_op(func, output_component_file=None, base_image=_default_base_image, extra_code='', modules_to_capture: List[str] = None):
292302
'''
293303
Converts a Python function to a component and returns a task (ContainerOp) factory
294304
@@ -307,13 +317,14 @@ def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('s
307317
Note: The image can also be specified by decorating the function with the @python_component decorator. If different base images are explicitly specified in both places, an error is raised.
308318
output_component_file: Optional. Write a component definition to a local file. Can be used for sharing.
309319
extra_code: Optional. Extra code to add before the function code. Can be used as workaround to define types used in function signature.
320+
modules_to_capture: Optional. List of module names that will be captured (instead of just referencing) during the dependency scan. By default the func.__module__ is captured. The actual algorithm: Starting with the initial function, start traversing dependencies. If the dependency.__module__ is in the modules_to_capture list then it's captured and its dependencies are traversed. Otherwise the dependency is only referenced instead of being captured and its dependencies are not traversed.
310321
311322
Returns:
312323
A factory function with a strongly-typed signature taken from the python function.
313324
Once called with the required arguments, the factory constructs a pipeline task instance (ContainerOp) that can run the original function in a container.
314325
'''
315326

316-
component_spec = _func_to_component_spec(func, extra_code, base_image)
327+
component_spec = _func_to_component_spec(func, extra_code, base_image, modules_to_capture)
317328

318329
output_component_file = output_component_file or getattr(func, '_component_target_component_file', None)
319330
if output_component_file:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
module_level_variable = 10
2+
3+
4+
class ModuleLevelClass:
5+
def class_method(self, x):
6+
return x * module_level_variable
7+
8+
9+
def module_func(a: float) -> float:
10+
return a * 5
11+
12+
13+
def module_func_with_deps(a: float, b: float) -> float:
14+
return ModuleLevelClass().class_method(a) + module_func(b)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .module1 import module_func_with_deps
2+
3+
def module2_func_with_deps(a: float, b: float) -> float:
4+
return module_func_with_deps(a, b) + 10

sdk/python/tests/components/test_python_op.py

+37
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,49 @@ def main_func(a: float, b: float) -> float:
130130

131131
self.helper_test_2_in_1_out_component_using_local_call(func, op)
132132

133+
def test_func_to_container_op_check_nothing_extra_captured(self):
134+
def f1():
135+
pass
136+
137+
def f2():
138+
pass
139+
140+
def main_func(a: float, b: float) -> float:
141+
f1()
142+
try:
143+
eval('f2()')
144+
except:
145+
return a + b
146+
raise AssertionError("f2 should not be captured, because it's not a dependency.")
147+
148+
expected_func = lambda a, b: a + b
149+
op = comp.func_to_container_op(main_func)
150+
151+
self.helper_test_2_in_1_out_component_using_local_call(expected_func, op)
152+
133153
def test_func_to_container_op_call_other_func_global(self):
134154
func = module_func_with_deps
135155
op = comp.func_to_container_op(func, output_component_file='comp.yaml')
136156

137157
self.helper_test_2_in_1_out_component_using_local_call(func, op)
138158

159+
def test_func_to_container_op_with_imported_func(self):
160+
from .test_data.module1 import module_func_with_deps as module1_func_with_deps
161+
func = module1_func_with_deps
162+
op = comp.func_to_container_op(func)
163+
164+
self.helper_test_2_in_1_out_component_using_local_call(func, op)
165+
166+
def test_func_to_container_op_with_imported_func2(self):
167+
from .test_data.module2_which_depends_on_module1 import module2_func_with_deps as module2_func_with_deps
168+
func = module2_func_with_deps
169+
op = comp.func_to_container_op(func, modules_to_capture=[
170+
'tests.components.test_data.module1',
171+
'tests.components.test_data.module2_which_depends_on_module1'
172+
])
173+
174+
self.helper_test_2_in_1_out_component_using_local_call(func, op)
175+
139176
def test_func_to_container_op_multiple_named_typed_outputs(self):
140177
from typing import NamedTuple
141178
def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('sum', float), ('product', float)]):

0 commit comments

Comments
 (0)