OcuLink Expansion Bay Module

From the EC source code the gpu config on the EEPROM is this:

/*
 * File-local default configuration image of type struct default_gpu_cfg.
 *
 * As written, the image is a descriptor followed by nine header/payload
 * pairs (hdr0..hdr8). Each header carries a block_type and a block_length
 * that is expressed as the sizeof of the payload member(s) initialized
 * right after it.
 *
 * NOTE(review): the struct definitions are not visible here; member order
 * and packing must be confirmed against the header that declares them.
 */
static struct default_gpu_cfg gpu_cfg = {
        .descriptor = {
                .magic = {0x32, 0xac, 0x00, 0x00},
                /* Size of the descriptor itself. */
                .length = sizeof(struct gpu_cfg_descriptor),
                .descriptor_version_major = 0,
                .descriptor_version_minor = 1,
                .hardware_version = 0x0008,
                .hardware_revision = 0,
                /* 20 initializers: 18 characters followed by two NUL bytes. */
                .serial = {'F', 'R', 'A', 'K', 'M', 'B', 'C', 'P', '8', '1',
                                        '3', '3', '1', 'A', 'S', 'S', 'Y', '0', '\0', '\0'},
                /* Size of everything in the image after the descriptor. */
                .descriptor_length = sizeof(struct default_gpu_cfg) - sizeof(struct gpu_cfg_descriptor),
                /*
                 * NOTE(review): both CRC fields are zero in this initializer;
                 * presumably they are computed elsewhere - confirm.
                 */
                .descriptor_crc32 = 0,
                .crc32 = 0
        },
        /* Block 0: PCIe configuration, one byte of payload. */
        .hdr0 = {.block_type = GPUCFG_TYPE_PCIE, .block_length = sizeof(uint8_t)},
        .pcie_cfg = PCIE_8X1,

        /* Blocks 1 and 2: one fan entry each; they differ in idx and max_rpm only. */
        .hdr1 = {.block_type = GPUCFG_TYPE_FAN, .block_length = sizeof(struct gpu_cfg_fan)},
        .fan0_cfg = {.idx = 0, .flags = 0, .min_rpm = 1000, .start_rpm = 1000, .max_rpm = 4700},

        .hdr2 = {.block_type = GPUCFG_TYPE_FAN, .block_length = sizeof(struct gpu_cfg_fan)},
        .fan1_cfg = {.idx = 1, .flags = 0, .min_rpm = 1000, .start_rpm = 1000, .max_rpm = 4500},

        /* Block 3: vendor identifier. */
        .hdr3 = {.block_type = GPUCFG_TYPE_VENDOR, .block_length = sizeof(gpu_vendor)},
        .vendor = GPU_AMD_R23M,

        /*
         * Block 4: GPIO table. The multiplier (7) equals the number of
         * entries initialized below: gpio0..gpio3, gpio_vsys, gpio_fan and
         * gpu_3v_5v_en. Keep the two in sync when adding or removing entries.
         */
        .hdr4 = {.block_type = GPUCFG_TYPE_GPIO, .block_length = (sizeof(struct gpu_cfg_gpio) * 7)},
        /* Critical temperature fault input */
        .gpio0 = {.gpio = GPU_1G1_GPIO0_EC, .function = GPIO_FUNC_TEMPFAULT, .flags = GPIO_INPUT, .power_domain = POWER_S3},
        /* DP HPD status from PD */
        .gpio1 = {.gpio = GPU_1H1_GPIO1_EC, .function = GPIO_FUNC_HPD, .flags = GPIO_INPUT, .power_domain = POWER_S5},
        /* AC/DC mode setting */
        .gpio2 = {.gpio = GPU_2A2_GPIO2_EC, .function = GPIO_FUNC_ACDC, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},
        /* UNUSED */
        .gpio3 = {.gpio = GPU_2L7_GPIO3_EC, .function = GPIO_FUNC_UNUSED, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_G3},
        /* GPU_VSYS_EN */
        .gpio_vsys = {.gpio = GPU_VSYS_EN, .function = GPIO_FUNC_GPU_PWR, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},

        /* GPU_FAN_EN */
        .gpio_fan = {.gpio = GPU_FAN_EN, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S0},

        /* GPU_3V_5V_EN */
        .gpu_3v_5v_en = {.gpio = GPU_3V_5V_EN, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S5},

        /*
         * Block 5: PD controller entry. gpio_hpd names the same pin that
         * gpio1 above configures with GPIO_FUNC_HPD.
         */
        .hdr5 = {.block_type = GPUCFG_TYPE_PD, .block_length = sizeof(struct gpu_subsys_pd)},
        .pd = {.gpu_pd_type = PD_TYPE_ETRON_EJ889I, .address = 0x60,
                        .flags = 0, .pdo = 0, .rdo = 0, .power_domain = POWER_S5,
                        .gpio_hpd = GPU_1H1_GPIO1_EC, .gpio_interrupt = GPU_1F2_I2C_S5_INT
        },

        /* Block 6: thermal sensor entry (type and address). */
        .hdr6 = {.block_type = GPUCFG_TYPE_THERMAL_SENSOR, .block_length = sizeof(struct gpu_cfg_thermal)},
        .therm = {.thermal_type = GPU_THERM_F75303, .address = 0x4D},

        /*
         * Block 7: custom temperature thresholds, given in Celsius and
         * converted with C_TO_K().
         * NOTE(review): idx = 2 presumably selects a sensor index - confirm.
         */
        .hdr7 = {.block_type = GPUCFG_TYPE_CUSTOM_TEMP, .block_length = sizeof(struct gpu_cfg_custom_temp)},
        .custom_temp = {.idx = 2, .temp_fan_off = C_TO_K(47), .temp_fan_max = C_TO_K(62)},

        /* Block 8: subsystem serial entry for GPU_PCB (18 characters plus two NUL bytes). */
        .hdr8 = {.block_type = GPUCFG_TYPE_SUBSYS, .block_length = sizeof(struct gpu_subsys_serial)},
        .pcba_serial = {.gpu_subsys = GPU_PCB, .serial = {'F', 'R', 'A', 'G', 'M', 'A', 'S', 'P', '8', '1',
                                        '3', '3', '1', 'P', 'C', 'B', '0', '0', '\0', '\0'},}
};

And the NVME/SSD one is this:

/*
 * File-local default configuration image of type struct default_ssd_cfg.
 *
 * As written, the image is a descriptor followed by five header/payload
 * pairs (hdr0..hdr4): PCIe configuration, two fan entries, a vendor
 * identifier and a GPIO table. Each header carries a block_type and a
 * block_length expressed as the sizeof of the payload member(s) initialized
 * right after it.
 *
 * NOTE(review): the struct definitions are not visible here; member order
 * and packing must be confirmed against the header that declares them.
 */
static struct default_ssd_cfg ssd_cfg = {
        .descriptor = {
                .magic = {0x32, 0xac, 0x00, 0x00},
                /* Size of the descriptor itself. */
                .length = sizeof(struct gpu_cfg_descriptor),
                .descriptor_version_major = 0,
                .descriptor_version_minor = 1,
                .hardware_version = 0x0008,
                .hardware_revision = 0,
                /*
                 * 20 initializers: 18 characters followed by two NUL bytes.
                 * NOTE(review): the trailing "DUMMY" looks like a placeholder
                 * serial - confirm.
                 */
                .serial = {'F', 'R', 'A', 'G', 'M', 'B', 'S', 'P', '8', '1',
                                        '3', '3', '1', 'D', 'U', 'M', 'M', 'Y', '\0', '\0'},
                /* Size of everything in the image after the descriptor. */
                .descriptor_length = sizeof(struct default_ssd_cfg) - sizeof(struct gpu_cfg_descriptor),
                /*
                 * NOTE(review): both CRC fields are zero in this initializer;
                 * presumably they are computed elsewhere - confirm.
                 */
                .descriptor_crc32 = 0,
                .crc32 = 0
        },
        /* Block 0: PCIe configuration, one byte of payload. */
        .hdr0 = {.block_type = GPUCFG_TYPE_PCIE, .block_length = sizeof(uint8_t)},
        .pcie_cfg = PCIE_4X2,

        /* Blocks 1 and 2: one fan entry each; they differ in idx only. */
        .hdr1 = {.block_type = GPUCFG_TYPE_FAN, .block_length = sizeof(struct gpu_cfg_fan)},
        .fan0_cfg = {.idx = 0, .flags = 0, .min_rpm = 1000, .start_rpm = 1000, .max_rpm = 3700},

        .hdr2 = {.block_type = GPUCFG_TYPE_FAN, .block_length = sizeof(struct gpu_cfg_fan)},
        .fan1_cfg = {.idx = 1, .flags = 0, .min_rpm = 1000, .start_rpm = 1000, .max_rpm = 3700},

        /* Block 3: vendor identifier. */
        .hdr3 = {.block_type = GPUCFG_TYPE_VENDOR, .block_length = sizeof(gpu_vendor)},
        .vendor = GPU_SSD,

        /*
         * Block 4: GPIO table. The multiplier (8) equals the number of
         * entries initialized below: gpio0..gpio3, gpio_edpaux, gpio_vsys,
         * gpio_fan and gpu_3v_5v_en. Keep the two in sync when adding or
         * removing entries.
         */
        .hdr4 = {.block_type = GPUCFG_TYPE_GPIO, .block_length = sizeof(struct gpu_cfg_gpio) * 8},
        /* Power enable for SSD1 */
        .gpio0 = {.gpio = GPU_1G1_GPIO0_EC, .function = GPIO_FUNC_SSD1_POWER, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},
        /* Power enable for SSD2 */
        .gpio1 = {.gpio = GPU_1H1_GPIO1_EC, .function = GPIO_FUNC_SSD2_POWER, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},
        /* UNUSED */
        .gpio2 = {.gpio = GPU_2A2_GPIO2_EC, .function = GPIO_FUNC_UNUSED, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_G3},
        /* UNUSED */
        .gpio3 = {.gpio = GPU_2L7_GPIO3_EC, .function = GPIO_FUNC_UNUSED, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_G3},
        /* set mux configuration on mainboard for SSD */
        .gpio_edpaux = {.gpio = GPU_PCIE_MUX_SEL, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},
        /* GPU_VSYS_EN */
        .gpio_vsys = {.gpio = GPU_VSYS_EN, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S3},

        /* GPU_FAN_EN */
        .gpio_fan = {.gpio = GPU_FAN_EN, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S0},

        /* GPU_3V_5V_EN */
        .gpu_3v_5v_en = {.gpio = GPU_3V_5V_EN, .function = GPIO_FUNC_HIGH, .flags = GPIO_OUTPUT_LOW, .power_domain = POWER_S5},
};


Well, the problem is that it is showing 8 lanes, so it is recognized. But why is it not actually working?

1 Like

It also looks like it is running at PCIe 1.1… yikes

@Gu_tally
Do you have any modelling software that takes into account the PCB material and the traces you used?
Do you have an oscilloscope / vector analyzer that could give you things like eye diagrams?
It might help you narrow down the problem to whether the problem is matching related or instead just picking up too much digital noise from the surroundings.
It might just need a re-timer or it might need the signalling moved to an intermediate layer to shield it better from noise.

It's always 1.1 when not under load with NVIDIA. You have to hit the "?" next to the PCIe version to toggle to full speed.

@Gu_tally what is not working?

Hey, it looks like an error43 problem which cannot be fixed by running the normal error43 software. Please try this:

"I ran the error43 fixer script just to try it out. It didn’t do anything since no NVIDIA GPU in state 43 was found. After that I had a look into the .bat file and commented the lines 130 + 131 out, like so (lines 129 - 132 below):

:: Check if adapter has error code 43. If not, exit subroutine
:: call devset status "%HW_id%" | findstr "code 43" > nul
:: if errorlevel 1 goto :EOF
set NV_ERR43_FOUND=1

and ran it again. Now it modified the registry anyway. Suddenly the external displays were detected on the eGPU."

I am suggesting it because my GPU-Z looked exactly the same when I first tried to use oculink, so maybe it can help. If it doesn’t help, perhaps consider posting in the forum under the link, (@nando4) could perhaps help there.

3 Likes

It’s most likely a hardware issue, but @Gu_tally I’d recommend that you try this as well. Had to modify this script in the same way in order to have my GPU be properly recognized over Oculink 4i. I’m using an RTX 4090.

2 Likes

That’s usually a power saving feature; the question mark next to it lets you load it slightly to see where it goes under load.

Fully agree with the hardware point, but the GPU-Z output looks like a mirror image of what I had before doing this. I’m using an RTX 3090.

And just to note, I think the fact that we are seeing 8 lanes here is already a very good sign.

My only other thought would be to change the GPU to keep PCIe 5.0 out of it (and RTX 5090 driver issues).

1 Like

Which PCIe x16 to OcuLink 8i adapter do you use for the GPU, @Gu_tally?

An SFF-8654 SlimSAS board that I tested earlier. It is one of the boards that have an 8i OcuLink/MCIO/SlimSAS-compatible input with good signal strength. But I am still testing. I will tell everyone where to buy these once I have it all working.

I guess so. I will go ahead and grab the cheapest second-hand card from my local CeX to test whether it is a PCIe 5.0 issue.

1 Like

Does anyone have the balls to disassemble the GPU module to see what it looks like? I guess if I can find and mimic the design of the connection part, then I can work out where the EEPROM should be and how the GPU link works.
Or is the PCB design for the GPU module already on GitHub and I am just being blind…

1 Like

Do these help you:

They tell you where to put the eeprom and how to program it.

1 Like

The thing is, I want to know exactly what is in the EEPROM of the GPU module. If it is the same as on the GitHub page, then it is a lot easier. But I suspect it is going to be different, because the design has differed in the past.

Hey @Gu_tally I can help you out here. I ordered an expansion shell and Framework accidentally sent me a graphics bay. They never asked for it back, so you’re welcome to have it to help debug.

DM me your address and I can ship it to ya :+1:

Have you, @Gu_tally, made sure that the PCIe diff pairs are 100% equal in length? (I did not see any miter in the picture, so that's why I’m asking.) The traces are a bit long, though, so a retimer might be a good idea. Also, impedance is critical: you really need to hit 85.0 ohms or there will be trouble.

even if the signal is not very good, at least it will run at gen3 speed.

Perhaps. But for the next PCB revision, a little perfectionism is not a bad thing.

@Gu_tally
Maybe if you open source the design files, others in the group here can do some of the design checks for you.