cuebook/cuelake

View on GitHub
api/seeddata/notebookTemplate.json

Summary

Maintainability
Test Coverage
[
  {
    "model": "genie.notebooktemplate",
    "pk": 1,
    "fields": {
      "template": {
        "info": {},
        "name": "{{name}}",
        "path": "/{{name}}",
        "config": {
          "looknfeel": "default",
          "personalizedMode": "false",
          "isZeppelinNotebookCronEnable": false
        },
        "version": "0.9.0",
        "noteForms": {},
        "noteParams": {},
        "paragraphs": [
          {
            "id": "paragraph_1616145245686_1473486751",
            "apps": [],
            "text": "%spark.pyspark\n# ==========================================\n# Connection Properties\n# ==========================================\ndBHost = \"{{sourceConnection_host}}\"\ndBPort = \"{{sourceConnection_port}}\"\ndBSID = \"{{sourceConnection_sid}}\"\ndBName = \"{{sourceConnection_database}}\"\ndBUsername = \"{{sourceConnection_username}}\"\ndBPassword = \"{{sourceConnection_password}}\"\ndBQuery = \"\"\"{{sqlQuery}}\"\"\"\nprimaryColumnName = \"{{primaryKeyColumn}}\"\n# ==========================================\n\nif \"{{sourceConnection_type}}\"==\"Oracle\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:oracle:thin:@{dBHost}:{dBPort}/{dBSID}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername) \\\n        .option(\"driver\", \"oracle.jdbc.driver.OracleDriver\") \\\n        .load()\n        \nelif \"{{sourceConnection_type}}\"==\"Postgres\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:postgresql://{dBHost}:{dBPort}/{dBName}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername) \\\n        .option(\"driver\", \"org.postgresql.Driver\")\n    if \"{{sourceConnection_sslRequired}}\" == \"True\":\n        df = df.option(\"sslmode\", \"require\")\n    df = df.load()\n    \nelif \"{{sourceConnection_type}}\"==\"MySQL\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:mysql://{dBHost}:{dBPort}/{dBName}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername)\n    if \"{{sourceConnection_sslRequired}}\" == \"True\":\n        df = df.option(\"sslmode\", \"require\")\n    df = df.load()\n        \n# Set order by column for SortOrder\ndf = df.orderBy(\"{{primaryKeyColumn}}\")\n\n# Store results in temp table\ndf.createOrReplaceTempView(\"{{tempTableName}}\")",
            "user": "anonymous",
            "focus": true,
            "title": "Read Source Database",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "tableHide": false,
              "editorHide": false,
              "editorMode": "ace/mode/python",
              "editorSetting": {
                "language": "python",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617352_2047119125",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:928",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T19:16:48+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1615528549981_865522288",
            "apps": [],
            "text": "%spark.sql\n-- Create table if not exist\nCREATE TABLE IF NOT EXISTS cuelake.{{destinationTableName}} USING iceberg \n  TBLPROPERTIES (\n        'write.metadata.delete-after-commit.enabled'= 'true',\n       'write.metadata.previous-versions-max' = 2,\n       'history.expire.max-snapshot-age-ms' = '60000',\n       'history.expire.min-snapshots-to-keep' = 1,\n       'write.spark.fanout.enabled' = 'true',\n        'write.metadata.metrics.default' = 'none'\n   )\n  AS SELECT /*+ COALESCE(1) */ * from {{tempTableName}}",
            "user": "anonymous",
            "title": "Create Table If Not Exist",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorHide": false,
              "editorMode": "ace/mode/sql",
              "editorSetting": {
                "language": "sql",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617352_1902312980",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:929",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T17:53:37+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1616655403846_1323321554",
            "apps": [],
            "text": "%spark\nval tableName = \"{{destinationTableName}}\"\n\nimport org.apache.iceberg.actions.Actions;\nimport org.apache.iceberg.hadoop.HadoopCatalog;\nval catalog = new HadoopCatalog(sc.hadoopConfiguration, sc.getConf.get(\"spark.sql.catalog.cuelake.warehouse\"));\n\nimport org.apache.iceberg.Table;\nimport org.apache.iceberg.catalog.TableIdentifier;\n\nval name = TableIdentifier.of(tableName);\nval table = catalog.loadTable(name);\n\n// Expire older snapshots and commit\ntable.expireSnapshots().commit()\n\n// Run Compaction for table\nActions.forTable(table).rewriteDataFiles()\n    .targetSizeInBytes(500 * 1024 * 1024 * 10) // 5000 MB\n    .execute();",
            "user": "anonymous",
            "title": "Iceberg Table Optimizations",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorMode": "ace/mode/scala",
              "editorSetting": {
                "language": "scala",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617352_1034740601",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:930",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T17:53:37+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1615885036622_2137552499",
            "apps": [],
            "text": "%spark.pyspark\nmaxVal=spark.sql(f\"SELECT MAX({{timestampColumn}}) FROM cuelake.{{destinationTableName}}\").collect()[0][0]\nz.put(\"{{tempTableName}}_val\", maxVal)\nprint(\"Timestamp value in target table: \" + str(maxVal))",
            "user": "anonymous",
            "title": "Get Timestamp From Destination Table",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorMode": "ace/mode/python",
              "editorSetting": {
                "language": "python",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617352_975985851",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:931",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T17:53:37+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1616144051503_913231705",
            "apps": [],
            "text": "%spark.sql\n-- SQL Query to configure sort order\nALTER TABLE cuelake.{{destinationTableName}} WRITE ORDERED BY {{primaryKeyColumn}};",
            "user": "anonymous",
            "title": "Update Iceberg Table Properties",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorMode": "ace/mode/sql",
              "editorSetting": {
                "language": "sql",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617352_810841844",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:932",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T17:53:37+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1616147422888_2080009648",
            "apps": [],
            "text": "%spark.pyspark\nmergeSql = f\"MERGE INTO cuelake.{{destinationTableName}} t USING (SELECT * from {{tempTableName}} where {{timestampColumn}} > \\\"{z.get('{{tempTableName}}_val')}\\\") u ON t.{{primaryKeyColumn}} = u.{{primaryKeyColumn}} WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT *\"\nprint(\"Executing SQL: \" + mergeSql)\nspark.sql(mergeSql)",
            "user": "anonymous",
            "title": "Merge New Data",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorMode": "ace/mode/python",
              "editorSetting": {
                "language": "python",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619459617353_1235940290",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:933",
            "dateCreated": "2021-04-26T17:53:37+0000",
            "dateUpdated": "2021-04-26T17:53:37+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          }
        ],
        "angularObjects": {},
        "defaultInterpreterGroup": "spark"
      },
      "formJson": {
        "fields": [
          {
            "name": "sourceConnection",
            "type": "connectionSelect",
            "label": "Source Connection",
            "rules": [
              {
                "message": "Source Connection is required",
                "required": true
              }
            ],
            "filter": [
              "Oracle",
              "Postgres",
              "MySQL"
            ]
          },
          {
            "name": "sqlQuery",
            "type": "sql",
            "label": "SQL Query",
            "rules": [
              {
                "message": "SQL Query is required",
                "required": true
              }
            ]
          },
          {
            "name": "timestampColumn",
            "type": "text",
            "label": "Timestamp Column",
            "rules": [
              {
                "message": "Timestamp Column is required",
                "required": true
              }
            ]
          },
          {
            "name": "primaryKeyColumn",
            "type": "text",
            "label": "Primary Key Column",
            "rules": [
              {
                "message": "Primary Key Column is required",
                "required": true
              }
            ]
          },
          {
            "name": "destinationTableName",
            "type": "text",
            "label": "Destination Table Name",
            "rules": [
              {
                "message": "Destination Table Name is required",
                "required": true
              }
            ]
          }
        ]
      },
      "name": "Incremental Refresh"
    }
  },
  {
    "model": "genie.notebooktemplate",
    "pk": 2,
    "fields": {
      "template": {
        "info": {},
        "name": "{{name}}",
        "path": "/{{name}}",
        "config": {
          "looknfeel": "default",
          "personalizedMode": "false",
          "isZeppelinNotebookCronEnable": false
        },
        "version": "0.9.0",
        "noteForms": {},
        "noteParams": {},
        "paragraphs": [
          {
            "id": "paragraph_1616145245686_1473486751",
            "apps": [],
            "text": "%spark.pyspark\n# ==========================================\n# Connection Properties\n# ==========================================\ndBHost = \"{{sourceConnection_host}}\"\ndBPort = \"{{sourceConnection_port}}\"\ndBSID = \"{{sourceConnection_sid}}\"\ndBName = \"{{sourceConnection_database}}\"\ndBUsername = \"{{sourceConnection_username}}\"\ndBPassword = \"{{sourceConnection_password}}\"\ndBQuery = \"\"\"{{sqlQuery}}\"\"\"\nprimaryColumnName = \"{{primaryKeyColumn}}\"\n# ==========================================\n\nif \"{{sourceConnection_type}}\"==\"Oracle\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:oracle:thin:@{dBHost}:{dBPort}/{dBSID}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername) \\\n        .option(\"driver\", \"oracle.jdbc.driver.OracleDriver\") \\\n        .load()\n        \nelif \"{{sourceConnection_type}}\"==\"Postgres\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:postgresql://{dBHost}:{dBPort}/{dBName}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername) \\\n        .option(\"driver\", \"org.postgresql.Driver\")\n    if \"{{sourceConnection_sslRequired}}\" == \"True\":\n        df = df.option(\"sslmode\", \"require\")\n    df = df.load()\n    \nelif \"{{sourceConnection_type}}\"==\"MySQL\":\n    df = spark.read \\\n        .format(\"jdbc\") \\\n        .option(\"url\", f\"jdbc:mysql://{dBHost}:{dBPort}/{dBName}\") \\\n        .option(\"query\", dBQuery) \\\n        .option(\"password\", dBPassword) \\\n        .option(\"user\", dBUsername)\n    if \"{{sourceConnection_sslRequired}}\" == \"True\":\n        df = df.option(\"sslmode\", \"require\")\n    df = df.load()\n\n# Store results in temp table\ndf.createOrReplaceTempView(\"{{tempTableName}}\")",
            "user": "anonymous",
            "focus": true,
            "title": "Read Source Database",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "tableHide": false,
              "editorHide": false,
              "editorMode": "ace/mode/python",
              "editorSetting": {
                "language": "python",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619471291607_747575162",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:1438",
            "dateCreated": "2021-04-26T21:08:11+0000",
            "dateUpdated": "2021-04-26T21:09:38+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          },
          {
            "id": "paragraph_1615528549981_865522288",
            "apps": [],
            "text": "%spark.sql\nDROP TABLE IF EXISTS cuelake.{{destinationTableName}}; \nCREATE TABLE cuelake.{{destinationTableName}} USING iceberg AS SELECT /*+ COALESCE(1) */ * from {{tempTableName}}",
            "user": "anonymous",
            "title": "Refresh Destination Table",
            "config": {
              "title": true,
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorHide": false,
              "editorMode": "ace/mode/sql",
              "editorSetting": {
                "language": "sql",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "jobName": "paragraph_1619471291608_1787407030",
            "progress": 0,
            "settings": {
              "forms": {},
              "params": {}
            },
            "$$hashKey": "object:1439",
            "dateCreated": "2021-04-26T21:08:11+0000",
            "dateUpdated": "2021-04-26T21:08:11+0000",
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          }
        ],
        "angularObjects": {},
        "defaultInterpreterGroup": "spark"
      },
      "formJson": {
        "fields": [
          {
            "name": "sourceConnection",
            "type": "connectionSelect",
            "label": "Source Connection",
            "rules": [
              {
                "message": "Source Connection is required",
                "required": true
              }
            ],
            "filter": [
              "Oracle",
              "Postgres",
              "MySQL"
            ]
          },
          {
            "name": "sqlQuery",
            "type": "sql",
            "label": "SQL Query",
            "rules": [
              {
                "message": "SQL Query is required",
                "required": true
              }
            ]
          },
          {
            "name": "destinationTableName",
            "type": "text",
            "label": "Destination Table Name",
            "rules": [
              {
                "message": "Destination Table Name is required",
                "required": true
              }
            ]
          }
        ]
      },
      "name": "Full Refresh"
    }
  },
  {
    "model": "genie.notebooktemplate",
    "pk": 3,
    "fields": {
      "template": {
        "info": {
          "isRunning": false
        },
        "name": "{{name}}",
        "config": {
          "looknfeel": "default",
          "personalizedMode": "false",
          "isZeppelinNotebookCronEnable": false
        },
        "version": "0.9.0",
        "noteForms": {},
        "noteParams": {},
        "paragraphs": [
          {
            "apps": [],
            "text": "%spark.sql\nSELECT * FROM",
            "user": "anonymous",
            "focus": true,
            "config": {
              "enabled": true,
              "results": {},
              "colWidth": 12,
              "fontSize": 9,
              "editorMode": "ace/mode/text",
              "editorSetting": {
                "language": "text",
                "completionKey": "TAB",
                "editOnDblClick": false,
                "completionSupport": true
              }
            },
            "status": "READY",
            "settings": {
              "forms": {},
              "params": {}
            },
            "runtimeInfos": {},
            "progressUpdateIntervalMs": 500
          }
        ],
        "angularObjects": {},
        "defaultInterpreterGroup": "spark"
      },
      "formJson": {
        "fields": []
      },
      "name": "Blank"
    }
  }
]