How to use jupyter nbconvert
最近在使用jupyter notebook的时候,发现notebook文件在问题探索方面非常方便,但是交付的话,还是期望能将其转换为python源文件。要将notebook源文件(.ipynb)转换为python源文件(.py),可以使用命令 jupyter nbconvert 来完成(注意:nbconvert只支持从.ipynb导出为其他格式,如果需要把.py反向转换为.ipynb,则要借助jupytext等其他工具)。举例如下。
这里有一个文件,名称为Chap14.ipynb,内容如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat Chap14.ipynb
{"cells": [{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["from pyalink.alink import *\n","useLocalEnv(1)\n","\n","from utils import *\n","import os\n","import pandas as pd\n","\n","pd.set_option('display.max_colwidth', 1000)\n","\n","DATA_DIR = ROOT_DIR + \"ctr_avazu\" + os.sep\n","\n","SCHEMA_STRING\\\n"," = \"id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, \"\\\n"," + \"site_category string, app_id string, app_domain string, app_category string, device_id string, \"\\\n"," + \"device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, \"\\\n"," + \"C16 int, C17 int, C18 int, C19 int, C20 int, C21 int\"\n","\n","CATEGORY_COL_NAMES = [\n"," \"C1\", \"banner_pos\", \"site_category\", \"app_domain\",\n"," \"app_category\", \"device_type\", \"device_conn_type\",\n"," \"site_id\", \"site_domain\", \"device_id\", \"device_model\"\n","]\n","\n","NUMERICAL_COL_NAMES = [\"C14\", \"C15\", \"C16\", \"C17\", \"C18\", \"C19\", \"C20\", \"C21\"]\n","\n","FEATURE_MODEL_FILE = \"feature_model.ak\"\n","INIT_MODEL_FILE = \"init_model.ak\"\n","\n","LABEL_COL_NAME = \"click\"\n","VEC_COL_NAME = \"vec\"\n","PREDICTION_COL_NAME = \"pred\"\n","PRED_DETAIL_COL_NAME = \"pred_info\"\n","\n","NUM_HASH_FEATURES = 30000\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["#c_2\n","TextSourceBatchOp()\\\n"," .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-small.csv\")\\\n"," .firstN(10)\\\n"," .print()\n","\n","trainBatchData = CsvSourceBatchOp()\\\n"," .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-small.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING);\n","\n","trainBatchData.firstN(10).print();\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["#c_3\n","trainBatchData = CsvSourceBatchOp()\\\n"," 
.setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-small.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING);\n","\n","feature_pipeline = Pipeline()\\\n"," .add(\n"," StandardScaler()\\\n"," .setSelectedCols(NUMERICAL_COL_NAMES)\n"," )\\\n"," .add(\n"," FeatureHasher()\\\n"," .setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)\\\n"," .setCategoricalCols(CATEGORY_COL_NAMES)\\\n"," .setOutputCol(VEC_COL_NAME)\\\n"," .setNumFeatures(NUM_HASH_FEATURES)\n"," );\n","\n","if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :\n"," feature_pipeline\\\n"," .fit(trainBatchData)\\\n"," .save(DATA_DIR + FEATURE_MODEL_FILE)\n"," BatchOperator.execute()\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["#c_4\n","feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)\n","\n","data = CsvSourceStreamOp()\\\n"," .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-ctr-train-8M.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING);\n","\n","if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :\n"," trainBatchData = CsvSourceBatchOp()\\\n"," .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-small.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING);\n","\n"," lr = LogisticRegressionTrainBatchOp()\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setWithIntercept(True)\\\n"," .setMaxIter(10);\n","\n"," feature_pipelineModel\\\n"," .transform(trainBatchData)\\\n"," .link(lr)\\\n"," .link(\n"," AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)\n"," );\n"," BatchOperator.execute();\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["#c_5 \n","feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);\n","\n","initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);\n","\n","data = CsvSourceStreamOp()\\\n"," 
.setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-ctr-train-8M.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING)\\\n"," .setIgnoreFirstLine(True)\n","\n","spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);\n","train_stream_data = feature_pipelineModel.transform(spliter);\n","test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));\n","\n","model = FtrlTrainStreamOp(initModel)\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setWithIntercept(True)\\\n"," .setAlpha(0.1)\\\n"," .setBeta(0.1)\\\n"," .setL1(0.01)\\\n"," .setL2(0.01)\\\n"," .setTimeInterval(10)\\\n"," .setVectorSize(NUM_HASH_FEATURES)\\\n"," .linkFrom(train_stream_data);\n","\n","predResult = FtrlPredictStreamOp(initModel)\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setPredictionCol(PREDICTION_COL_NAME)\\\n"," .setReservedCols([LABEL_COL_NAME])\\\n"," .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n"," .linkFrom(model, test_stream_data);\n","\n","# predResult\\\n","# .sample(0.0001)\\\n","# .select(\"'Pred Sample' AS out_type, *\")\\\n","# .print();\n","\n","predResult.print(key=\"predResult\", refreshInterval = 30, maxLimit=20)"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": ["predResult\\\n"," .link(\n"," EvalBinaryClassStreamOp()\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n"," .setTimeInterval(10)\n"," )\\\n"," .link(\n"," JsonValueStreamOp()\\\n"," .setSelectedCol(\"Data\")\\\n"," .setReservedCols([\"Statistics\"])\\\n"," .setOutputCols([\"Accuracy\", \"AUC\", \"ConfusionMatrix\"])\\\n"," .setJsonPath([\"$.Accuracy\", \"$.AUC\", \"$.ConfusionMatrix\"])\n"," )\\\n"," .print(key=\"evaluation\", refreshInterval = 30, maxLimit=20)\n","# .select(\"'Eval Metric' AS out_type, *\")\\\n","# .print();\n","\n","StreamOperator.execute();\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": 
["#c_6\n","data = CsvSourceStreamOp()\\\n"," .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n"," + \"data-files/avazu-ctr-train-8M.csv\")\\\n"," .setSchemaStr(SCHEMA_STRING)\\\n"," .setIgnoreFirstLine(True);\n","\n","feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);\n","\n","spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);\n","train_stream_data = feature_pipelineModel.transform(spliter);\n","test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));\n","\n","initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);\n","\n","model = FtrlTrainStreamOp(initModel)\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setWithIntercept(True)\\\n"," .setAlpha(0.1)\\\n"," .setBeta(0.1)\\\n"," .setL1(0.01)\\\n"," .setL2(0.01)\\\n"," .setTimeInterval(10)\\\n"," .setVectorSize(NUM_HASH_FEATURES)\\\n"," .linkFrom(train_stream_data);\n","\n","model_filter = FtrlModelFilterStreamOp()\\\n"," .setPositiveLabelValueString(\"1\")\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setAccuracyThreshold(0.83)\\\n"," .setAucThreshold(0.71)\\\n"," .linkFrom(model, train_stream_data);\n","\n","model_filter\\\n"," .select(\"'Model' AS out_type, *\")\\\n"," .print();\n","\n","predResult = FtrlPredictStreamOp(initModel)\\\n"," .setVectorCol(VEC_COL_NAME)\\\n"," .setPredictionCol(PREDICTION_COL_NAME)\\\n"," .setReservedCols([LABEL_COL_NAME])\\\n"," .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n"," .linkFrom(model_filter, test_stream_data);\n","\n","predResult\\\n"," .sample(0.0001)\\\n"," .select(\"'Pred Sample' AS out_type, *\")\\\n"," .print();\n","\n","predResult\\\n"," .link(\n"," EvalBinaryClassStreamOp()\\\n"," .setPositiveLabelValueString(\"1\")\\\n"," .setLabelCol(LABEL_COL_NAME)\\\n"," .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n"," .setTimeInterval(10)\n"," )\\\n"," .link(\n"," JsonValueStreamOp()\\\n"," .setSelectedCol(\"Data\")\\\n"," 
.setReservedCols([\"Statistics\"])\\\n"," .setOutputCols([\"Accuracy\", \"AUC\", \"ConfusionMatrix\"])\\\n"," .setJsonPath([\"$.Accuracy\", \"$.AUC\", \"$.ConfusionMatrix\"])\n"," )\\\n"," .select(\"'Eval Metric' AS out_type, *\")\\\n"," .print();\n","\n","StreamOperator.execute();\n"]},{"cell_type": "code","execution_count": null,"metadata": {},"outputs": [],"source": []}],"metadata": {"kernelspec": {"display_name": "Python 3","language": "python","name": "python3"},"language_info": {"codemirror_mode": {"name": "ipython","version": 3},"file_extension": ".py","mimetype": "text/x-python","name": "python","nbconvert_exporter": "python","pygments_lexer": "ipython3","version": "3.8.8"}},"nbformat": 4,"nbformat_minor": 4
}
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
接下来,我们借助命令 jupyter nbconvert 将其转换为.py文件,命令如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ mkdir -p python
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ jupyter nbconvert --to python Chap14.ipynb --output-dir python
[NbConvertApp] Converting notebook Chap14.ipynb to python
[NbConvertApp] Writing 7347 bytes to python/Chap14.py
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
我们看到,已经在python目录下生成了文件Chap14.py。
接下来,我们看一下生成的Chap14.py文件的内容:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat python/Chap14.py
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from pyalink.alink import *
useLocalEnv(1)

from utils import *
import os
import pandas as pd

pd.set_option('display.max_colwidth', 1000)

DATA_DIR = ROOT_DIR + "ctr_avazu" + os.sep

SCHEMA_STRING\
    = "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, "\
    + "site_category string, app_id string, app_domain string, app_category string, device_id string, "\
    + "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, "\
    + "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int"

CATEGORY_COL_NAMES = [
    "C1", "banner_pos", "site_category", "app_domain",
    "app_category", "device_type", "device_conn_type",
    "site_id", "site_domain", "device_id", "device_model"
]

NUMERICAL_COL_NAMES = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]

FEATURE_MODEL_FILE = "feature_model.ak"
INIT_MODEL_FILE = "init_model.ak"

LABEL_COL_NAME = "click"
VEC_COL_NAME = "vec"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"

NUM_HASH_FEATURES = 30000


# In[ ]:


#c_2
TextSourceBatchOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-small.csv")\
    .firstN(10)\
    .print()

trainBatchData = CsvSourceBatchOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-small.csv")\
    .setSchemaStr(SCHEMA_STRING);

trainBatchData.firstN(10).print();


# In[ ]:


#c_3
trainBatchData = CsvSourceBatchOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-small.csv")\
    .setSchemaStr(SCHEMA_STRING);

feature_pipeline = Pipeline()\
    .add(
        StandardScaler()\
            .setSelectedCols(NUMERICAL_COL_NAMES)
    )\
    .add(
        FeatureHasher()\
            .setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)\
            .setCategoricalCols(CATEGORY_COL_NAMES)\
            .setOutputCol(VEC_COL_NAME)\
            .setNumFeatures(NUM_HASH_FEATURES)
    );

if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :
    feature_pipeline\
        .fit(trainBatchData)\
        .save(DATA_DIR + FEATURE_MODEL_FILE)
    BatchOperator.execute()


# In[ ]:


#c_4
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)

data = CsvSourceStreamOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-ctr-train-8M.csv")\
    .setSchemaStr(SCHEMA_STRING);

if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :
    trainBatchData = CsvSourceBatchOp()\
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
            + "data-files/avazu-small.csv")\
        .setSchemaStr(SCHEMA_STRING);

    lr = LogisticRegressionTrainBatchOp()\
        .setVectorCol(VEC_COL_NAME)\
        .setLabelCol(LABEL_COL_NAME)\
        .setWithIntercept(True)\
        .setMaxIter(10);

    feature_pipelineModel\
        .transform(trainBatchData)\
        .link(lr)\
        .link(
            AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)
        );
    BatchOperator.execute();


# In[ ]:


#c_5
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);

initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);

data = CsvSourceStreamOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-ctr-train-8M.csv")\
    .setSchemaStr(SCHEMA_STRING)\
    .setIgnoreFirstLine(True)

spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));

model = FtrlTrainStreamOp(initModel)\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setWithIntercept(True)\
    .setAlpha(0.1)\
    .setBeta(0.1)\
    .setL1(0.01)\
    .setL2(0.01)\
    .setTimeInterval(10)\
    .setVectorSize(NUM_HASH_FEATURES)\
    .linkFrom(train_stream_data);

predResult = FtrlPredictStreamOp(initModel)\
    .setVectorCol(VEC_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setReservedCols([LABEL_COL_NAME])\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .linkFrom(model, test_stream_data);

# predResult\
#     .sample(0.0001)\
#     .select("'Pred Sample' AS out_type, *")\
#     .print();

predResult.print(key="predResult", refreshInterval = 30, maxLimit=20)


# In[ ]:


predResult\
    .link(
        EvalBinaryClassStreamOp()\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .setTimeInterval(10)
    )\
    .link(
        JsonValueStreamOp()\
            .setSelectedCol("Data")\
            .setReservedCols(["Statistics"])\
            .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])\
            .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
    )\
    .print(key="evaluation", refreshInterval = 30, maxLimit=20)
#     .select("'Eval Metric' AS out_type, *")\
#     .print();

StreamOperator.execute();


# In[ ]:


#c_6
data = CsvSourceStreamOp()\
    .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
        + "data-files/avazu-ctr-train-8M.csv")\
    .setSchemaStr(SCHEMA_STRING)\
    .setIgnoreFirstLine(True);

feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);

spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));

initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);

model = FtrlTrainStreamOp(initModel)\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setWithIntercept(True)\
    .setAlpha(0.1)\
    .setBeta(0.1)\
    .setL1(0.01)\
    .setL2(0.01)\
    .setTimeInterval(10)\
    .setVectorSize(NUM_HASH_FEATURES)\
    .linkFrom(train_stream_data);

model_filter = FtrlModelFilterStreamOp()\
    .setPositiveLabelValueString("1")\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setAccuracyThreshold(0.83)\
    .setAucThreshold(0.71)\
    .linkFrom(model, train_stream_data);

model_filter\
    .select("'Model' AS out_type, *")\
    .print();

predResult = FtrlPredictStreamOp(initModel)\
    .setVectorCol(VEC_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setReservedCols([LABEL_COL_NAME])\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .linkFrom(model_filter, test_stream_data);

predResult\
    .sample(0.0001)\
    .select("'Pred Sample' AS out_type, *")\
    .print();

predResult\
    .link(
        EvalBinaryClassStreamOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .setTimeInterval(10)
    )\
    .link(
        JsonValueStreamOp()\
            .setSelectedCol("Data")\
            .setReservedCols(["Statistics"])\
            .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])\
            .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
    )\
    .select("'Eval Metric' AS out_type, *")\
    .print();

StreamOperator.execute();


# In[ ]:


(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$