The steps for this recipe are as follows:
- Import the libraries:
import pandas as pd
import numpy as np
from pyspark.sql.types import *
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
import pickle
import mlflow
- Import the data:
file_location = "/FileStore/tables/train_FD001.txt"
file_type = "csv"
schema = StructType([
StructField("engine_id", IntegerType()),
StructField("cycle", IntegerType()),
StructField("setting1", DoubleType()),
StructField("setting2", DoubleType()),
StructField("setting3", DoubleType()),
StructField("s1", DoubleType()),
StructField("s2", DoubleType()),
StructField("s3", DoubleType()),
StructField("...