使用 GNU C 内联汇编在 VGA 内存中绘制字符 [英] Drawing a character in VGA memory with GNU C inline assembly

查看:27
本文介绍了使用 GNU C 内联汇编在 VGA 内存中绘制字符的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在学习使用 C 和内联汇编在 DOS 中进行一些低级 VGA 编程.现在我正在尝试创建一个在屏幕上打印字符的函数.

I'm learning to do some low level VGA programming in DOS with C and inline assembly. Right now I'm trying to create a function that prints out a character on screen.

这是我的代码:

/*
 * Glyph bitmaps used by the put_char routines that follow.
 *
 * Each glyph occupies 7 consecutive bytes, one byte per pixel row. The table
 * starts at ASCII 32 (space): put_char subtracts 32 from the character code
 * and multiplies by 7 to locate a glyph's first row. Within a row, put_char
 * shifts bits out from the most significant end and tests the first 6 of
 * them, so bit 7 is the leftmost pixel.
 *
 * NOTE(review): the initializer list looks shorter than the declared 464
 * bytes (any remainder is zero-initialized) -- confirm the intended size.
 */
uint8_t characters[464] = {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x20,0x20,0x20,0x00,0x20,0x00,0x50,
  0x50,0x00,0x00,0x00,0x00,0x00,0x50,0xf8,0x50,0x50,0xf8,0x50,0x00,0x20,0xf8,0xa0,
  0xf8,0x28,0xf8,0x00,0xc8,0xd0,0x20,0x20,0x58,0x98,0x00,0x40,0xa0,0x40,0xa8,0x90,
  0x68,0x00,0x20,0x40,0x00,0x00,0x00,0x00,0x00,0x20,0x40,0x40,0x40,0x40,0x20,0x00,
  0x20,0x10,0x10,0x10,0x10,0x20,0x00,0x50,0x20,0xf8,0x20,0x50,0x00,0x00,0x20,0x20,
  0xf8,0x20,0x20,0x00,0x00,0x00,0x00,0x00,0x60,0x20,0x40,0x00,0x00,0x00,0xf8,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x60,0x60,0x00,0x00,0x08,0x10,0x20,0x40,0x80,
  0x00,0x70,0x88,0x98,0xa8,0xc8,0x70,0x00,0x20,0x60,0x20,0x20,0x20,0x70,0x00,0x70,
  0x88,0x08,0x70,0x80,0xf8,0x00,0xf8,0x10,0x30,0x08,0x88,0x70,0x00,0x20,0x40,0x90,
  0x90,0xf8,0x10,0x00,0xf8,0x80,0xf0,0x08,0x88,0x70,0x00,0x70,0x80,0xf0,0x88,0x88,
  0x70,0x00,0xf8,0x08,0x10,0x20,0x20,0x20,0x00,0x70,0x88,0x70,0x88,0x88,0x70,0x00,
  0x70,0x88,0x88,0x78,0x08,0x70,0x00,0x30,0x30,0x00,0x00,0x30,0x30,0x00,0x30,0x30,
  0x00,0x30,0x10,0x20,0x00,0x00,0x10,0x20,0x40,0x20,0x10,0x00,0x00,0xf8,0x00,0xf8,
  0x00,0x00,0x00,0x00,0x20,0x10,0x08,0x10,0x20,0x00,0x70,0x88,0x10,0x20,0x00,0x20,
  0x00,0x70,0x90,0xa8,0xb8,0x80,0x70,0x00,0x70,0x88,0x88,0xf8,0x88,0x88,0x00,0xf0,
  0x88,0xf0,0x88,0x88,0xf0,0x00,0x70,0x88,0x80,0x80,0x88,0x70,0x00,0xe0,0x90,0x88,
  0x88,0x90,0xe0,0x00,0xf8,0x80,0xf0,0x80,0x80,0xf8,0x00,0xf8,0x80,0xf0,0x80,0x80,
  0x80,0x00,0x70,0x88,0x80,0x98,0x88,0x70,0x00,0x88,0x88,0xf8,0x88,0x88,0x88,0x00,
  0x70,0x20,0x20,0x20,0x20,0x70,0x00,0x10,0x10,0x10,0x10,0x90,0x60,0x00,0x90,0xa0,
  0xc0,0xa0,0x90,0x88,0x00,0x80,0x80,0x80,0x80,0x80,0xf8,0x00,0x88,0xd8,0xa8,0x88,
  0x88,0x88,0x00,0x88,0xc8,0xa8,0x98,0x88,0x88,0x00,0x70,0x88,0x88,0x88,0x88,0x70,
  0x00,0xf0,0x88,0x88,0xf0,0x80,0x80,0x00,0x70,0x88,0x88,0xa8,0x98,0x70,0x00,0xf0,
  0x88,0x88,0xf0,0x90,0x88,0x00,0x70,0x80,0x70,0x08,0x88,0x70,0x00,0xf8,0x20,0x20,
  0x20,0x20,0x20,0x00,0x88,0x88,0x88,0x88,0x88,0x70,0x00,0x88,0x88,0x88,0x88,0x50,
  0x20,0x00,0x88,0x88,0x88,0xa8,0xa8,0x50,0x00,0x88,0x50,0x20,0x20,0x50,0x88,0x00,
  0x88,0x50,0x20,0x20,0x20,0x20,0x00,0xf8,0x10,0x20,0x40,0x80,0xf8,0x00,0x60,0x40,
  0x40,0x40,0x40,0x60,0x00,0x00,0x80,0x40,0x20,0x10,0x08,0x00,0x30,0x10,0x10,0x10,
  0x10,0x30,0x00,0x20,0x50,0x88,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xf8,
  0x00,0xf8,0xf8,0xf8,0xf8,0xf8,0xf8};
/**************************************************************************
 *  put_char                                                              *
 *     Print char                                                         *
 **************************************************************************/
/*
 * First attempt at drawing one glyph from characters[] with inline asm.
 *
 * The glyph offset is computed as (ascii_char - 32) * 7. The outer loop
 * (label 0) runs once per bitmap row, 7 times; the inner loop (label 1)
 * shifts 6 bits out of the row byte and stores the color for every set bit.
 * After each row %di has advanced by 6 + (320 - 6) = 320 bytes.
 *
 * This listing is kept as posted and does not assemble: see the three
 * assembler errors quoted after it. The asm never references x or y, and
 * %di is used as the destination pointer without being loaded in this block.
 * The names `color`, `ascii_char` and `characters` inside the template are
 * plain assembler symbols; this asm statement has no operand list binding
 * them to the C parameters.
 */
void put_char(int x ,int y,int ascii_char ,byte color){

    __asm__(
        "push %si\n\t"
        "push %di\n\t"
        "push %cx\n\t"
        "mov color,%dl\n\t"   // DL = pixel color
        "mov ascii_char,%al\n\t"  // AL = character code
        "sub $32,%al\n\t"     // table starts at ASCII 32
        "mov $7,%ah\n\t"
        "mul %ah\n\t"         // AX = AL * 7 (7 bytes per glyph)
        "lea $characters,%si\n\t"   // rejected: "Operand mismatch type for 'lea'"
        "add %ax,%si\n\t"     // SI -> first row of the glyph
        "mov $7,%cl\n\t"      // CL = rows remaining
        "0:\n\t"
        "segCS %lodsb\n\t"    // rejected: "No such instruction 'segcs lodsb'"
        "mov $6,%ch\n\t"      // CH = pixels remaining in this row
        "1:\n\t"
        "shl $1,%al\n\t"      // next bitmap bit -> carry flag
        "jnc 2f\n\t"          // clear bit: leave the pixel untouched
        "mov %dl,%ES:(%di)\n\t"
        "2:\n\t"
        "inc %di\n\t"
        "dec %ch\n\t"
        "jnz 1b\n\t"
        "add $320-6,%di\n\t"  // move to the start of the next row
        "dec %cl\n\t"
        "jnz  0b\n\t"
        "pop %cx\n\t"
        "pop %di\n\t"
        "pop %si\n\t"
        "retn"                // rejected: "No such instruction 'retn'"

    );


}

我正在指导自己学习这一系列用 PASCAL 编写的教程:http://www.joco.homeserver.hu/vgalessons/lesson8.html .

I'm following this series of tutorials written in Pascal: http://www.joco.homeserver.hu/vgalessons/lesson8.html .

我根据 gcc 编译器更改了汇编语法,但仍然出现此错误:

I changed the assembly syntax according to the gcc compiler, but I'm still getting these errors:

Operand mismatch type for 'lea'
No such instruction 'segcs lodsb'
No such instruction 'retn'

<hr>

我一直在努力改进我的代码,至少现在我在屏幕上看到了一些东西.这是我更新的代码:


I have been working on improving my code and at least now I see something on the screen. Here's my updated code:

/**************************************************************************
 *  put_char                                                              *
 *     Print char                                                         *
 **************************************************************************/
/*
 * Second attempt: draws the glyph selected by the assembler symbol
 * _ascii_char, using the color passed in %dl from current_color.
 *
 * offset is computed as y*320 + x, written as (y<<8) + (y<<6) + x.
 * The asm adds the value loaded from _VGA to %edi, indexes _characters by
 * (_ascii_char - 32) * 7, then walks 7 rows of 6 pixels, advancing %edi by
 * 320 bytes per row.
 *
 * NOTE(review): offset is declared only as an output ("=D"), so nothing in
 * the constraints asks the compiler to place its computed value in %edi
 * before the asm runs. The asm also writes %ebx, %eax, %esi and %ecx without
 * listing them as clobbers.
 */
void put_char(int x,int y){
    int char_offset;
    int l,i,j,h,offset;
    j,h,l,i=0;
    offset = (y<<8) + (y<<6) + x;               
    __asm__(

        "movl _VGA, %%ebx;" // VGA memory pointer   
        "addl %%ebx,%%edi;"  //%di points to screen


        "mov _ascii_char,%%al;"
        "sub $32,%%al;"
        "mov $7,%%ah;"
        "mul %%ah;"

        "lea _characters,%%si;"
        "add %%ax,%%si;"   //SI point to bitmap

        "mov $7,%%cl;"

        "0:;"
            "lodsb %%cs:(%%si);"   //load next byte of bitmap 

            "mov $6,%%ch;"
        "1:;"   
            "shl $1,%%al;"
            "jnc 2f;"
            "movb %%dl,(%%edi);"  //plot the pixel
        "2:\n\t"
            "incl %%edi;"
            "dec %%ch;"
            "jnz 1b;"
            "addl $320-6,%%edi;"
            "dec %%cl;"
            "jnz  0b;"


        :  "=D" (offset)
        : "d" (current_color)

    );


}

如果你看到上面的图片,我正在尝试写字母"S".结果是您在屏幕左上角看到的绿色像素.不管我给函数传入什么 x 和 y,它总是在同一个位置绘制像素.

If you see the image above I was trying to write the letter "S". The results are the green pixels that you see on the upper left side of the screen. No matter what x and y I give the function it always plots the pixels on that same spot.

谁能帮我改正我的代码?

Can anyone help me correct my code?

推荐答案

请参阅下文分析您的 put_char 函数的一些具体问题,以及可能有效的版本.(我不确定 %cs 段覆盖,但除此之外,它应该按照您的意愿行事.)

See below for an analysis of some things that are specifically wrong with your put_char function, and a version that might work. (I'm not sure about the %cs segment override, but other than that it should do what you intend).

首先,DOS 和 16 位 x86 已经完全过时了,并不比普通的 64 位 x86 更容易学习.即使 32 位 x86 已经过时,但在 Windows 世界中仍然广泛使用.

First of all, DOS and 16-bit x86 are thoroughly obsolete, and are not easier to learn than normal 64-bit x86. Even 32-bit x86 is obsolete, but still in wide use in the Windows world.

32 位和 64 位代码不必关心许多 16 位限制/复杂性,例如段或寻址模式中的有限寄存器选择.一些现代系统确实使用段覆盖来进行线程本地存储,但学习如何在 16 位代码中使用段几乎与此无关.

32-bit and 64-bit code don't have to care about a lot of 16-bit limitations / complications like segments or limited register choice in addressing modes. Some modern systems do use segment overrides for thread-local storage, but learning how to use segments in 16-bit code is barely connected to that.

了解 asm 的主要好处之一是调试/分析/优化实际程序.如果您想了解如何编写能够(并且确实会)编译为高效 asm 的 C 或其他高级代码,您可能需要查看编译器输出.这将是 64 位(或 32 位).(例如,参见 Matt Godbolt 的 CppCon2017 演讲:"我的编译器最近为我做了什么?打开编译器的盖子",其中为完全的初学者提供了阅读 x86 asm 以及查看编译器输出的优秀介绍).

One of the major benefits to knowing asm is for debugging / profiling / optimizing real programs. If you want to understand how to write C or other high-level code that can (and actually does) compile to efficient asm, you'll probably be looking at compiler output. This will be 64-bit (or 32-bit). (e.g. see Matt Godbolt's CppCon2017 talk: "What Has My Compiler Done for Me Lately? Unbolting the Compiler's Lid" which has an excellent intro to reading x86 asm for total beginners, and to looking at compiler output).

在查看注释二进制反汇编的性能计数器结果时,Asm 知识很有用 (perf stat ./a.out && perf report -Mintel:参见 Chandler Carruth 的 CppCon2015 演讲:调整 C++:基准、CPU 和编译器!哦,我的天啊!").积极的编译器优化意味着查看每条源代码行的周期/缓存未命中/停顿计数比每条指令的信息要少得多.

Asm knowledge is useful when looking at performance-counter results annotating a disassembly of your binary (perf stat ./a.out && perf report -Mintel: see Chandler Carruth's CppCon2015 talk: "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!"). Aggressive compiler optimizations mean that looking at cycle / cache-miss / stall counts per source line are much less informative than per instruction.

此外,为了让您的程序真正任何事情,它必须要么直接与硬件对话,要么进行系统调用.学习文件访问和用户输入的 DOS 系统调用完全是在浪费时间(除了回答关于如何读取和打印 16 位代码中的多位数字的 SO 问题的源源不断的问题).它们与当前主要操作系统中的 API 大不相同.开发新的 DOS 应用程序是没有用的,所以当你到了用你的 asm 知识做某事的阶段时,你必须学习另一个 API(以及 ABI).

Also, for your program to actually do anything, it has to either talk to hardware directly, or make system calls. Learning DOS system calls for file access and user input is a complete waste of time (except for answering the steady stream of SO questions about how to read and print multi-digit numbers in 16-bit code). They're quite different from the APIs in the current major OSes. Developing new DOS applications is not useful, so you'd have to learn another API (as well as ABI) when you get to the stage of doing something with your asm knowledge.

在 8086 模拟器上学习 asm 的限制更大:186、286 和 386 添加了许多方便的指令,例如 imul ecx, 15,使 ax 不那么"特殊".将自己限制为仅适用于 8086 的指令意味着您会琢磨出一些"坏"的做事方法.其他重要的还有 movzx/movsx,按立即计数(非 1)移位,以及 push 立即数.除了性能之外,当这些可用时编写代码也更容易,因为您不必编写循环来移位超过 1 位.

Learning asm on an 8086 simulator is even more limiting: 186, 286, and 386 added many convenient instructions like imul ecx, 15, making ax less "special". Limiting yourself to only instructions that work on 8086 means you'll figure out "bad" ways to do things. Other big ones are movzx / movsx, shift by an immediate count (other than 1), and push immediate. Besides performance, it's also easier to write code when these are available, because you don't have to write a loop to shift by more than 1 bit.

我主要是通过阅读编译器输出来学习 asm,然后进行一些小的更改.当我并不真正理解事物时,我并没有尝试在 asm 中编写东西,但是如果您要快速学习(而不仅仅是在调试/分析 C 时发展理解),您可能需要通过以下方式测试您的理解编写自己的代码.您确实需要了解基础知识,即有 8 或 16 个整数寄存器 + 标志和指令指针,并且每条指令都对机器的当前架构状态进行了明确定义的修改.(有关每条指令的完整说明,请参阅英特尔 insn 参考手册( wiki,以及更多好东西).

I mostly learned asm from reading compiler output, then making small changes. I didn't try to write stuff in asm when I didn't really understand things, but if you're going to learn quickly (rather than just evolve an understanding while debugging / profiling C), you probably need to test your understanding by writing your own code. You do need to understand the basics, that there are 8 or 16 integer registers + the flags and instruction pointer, and that every instruction makes a well-defined modification to the current architectural state of the machine. (See the Intel insn ref manual for complete descriptions of every instruction (links in the x86 wiki, along with much more good stuff).)

您可能想从简单的事情开始,例如在 asm 中编写单个函数,作为更大程序的一部分.了解进行系统调用所需的 asm 类型很有用,但在实际程序中,通常只有为不涉及任何系统调用的内部循环手写 asm 才有用.编写 asm 来读取输入和打印结果是很耗时的,所以我建议用 C 来做这部分.确保你阅读了编译器输出并理解发生了什么,以及整数和字符串之间的区别,以及 strtol 和 printf 的作用,即使您不是自己编写的.

You might want to start with simple things like writing a single function in asm, as part of a bigger program. Understanding the kind of asm needed to make system calls is useful, but in real programs it's normally only useful to hand-write asm for inner loops that don't involve any system calls. It's time-consuming to write asm to read input and print results, so I'd suggest doing that part in C. Make sure you read the compiler output and understand what's going on, and the difference between an integer and a string, and what strtol and printf do, even if you don't write them yourself.

一旦您认为自己对基础知识了解得足够多,请在您熟悉和/或感兴趣的某个程序中找到一个函数,然后看看您是否可以击败编译器并保存指令(或使用更快的指令).或者自己实现它而无需使用编译器输出作为起点,无论您觉得哪个更有趣.这个答案可能很有趣,尽管重点是找到让编译器生成最佳 ASM 的 C 源代码.

Once you think you understand enough of the basics, find a function in some program you're familiar with and/or interested in, and see if you can beat the compiler and save instructions (or use faster instructions). Or implement it yourself without using the compiler output as a starting point, whichever you find more interesting. This answer might be interesting, although the focus there was finding C source that got the compiler to produce the optimal ASM.

有很多 SO 问题是人们在问"我如何在 asm 中做 X",答案通常是"和你在 C 中做的一样".不要因为 asm 不熟悉而过于纠结,以至于忘记了如何编程.弄清楚函数操作的数据需要发生什么,然后弄清楚如何在 asm 中做到这一点.如果您遇到困难并且不得不提出问题,那么您应该已经有了大部分可行的实现,只有某一个步骤您不知道该使用什么指令.

There are many SO questions from people asking "how do I do X in asm", and the answer is usually "the same as you would in C". Don't get so caught up in asm being unfamiliar that you forget how to program. Figure out what needs to happen to the data the function operates on, then figure out how to do that in asm. If you get stuck and have to ask a question, you should have most of a working implementation, with just one part that you don't know what instructions to use for one step.

您应该在 32 位或 64 位 x86 上执行此操作.我建议使用 64 位,因为 ABI 更好,但 32 位函数将迫使您更多地使用堆栈.所以这可能会帮助您了解 call 指令如何将返回地址放在堆栈上,以及调用者实际推送的参数在此之后的位置.(这似乎是您试图通过使用内联 asm 来避免处理的问题).

You should do this with 32 or 64bit x86. I'd suggest 64bit, since the ABI is nicer, but 32bit functions will force you to make more use of the stack. So that might help you understand how a call instruction puts the return address on the stack, and where the args the caller pushed actually are after that. (This appears to be what you tried to avoid dealing with by using inline asm).

通过直接修改视频 RAM 来学习如何做图形没有用,除了满足对计算机过去如何工作的好奇心之外.你不能将这些知识用于任何事情.现代图形 API 的存在是为了让多个程序在它们自己的屏幕区域中绘制,并允许间接绘制(例如,在纹理上而不是直接在屏幕上绘制,因此 3D 窗口翻转 alt-tab 看起来很漂亮).不直接在视频 RAM 上绘图的原因太多了.

Learning how to do graphics by directly modifying video RAM is not useful, other than to satisfy curiosity about how computers used to work. You can't use that knowledge for anything. Modern graphics APIs exist to let multiple programs draw in their own regions of the screen, and to allow indirection (e.g. draw on a texture instead of the screen directly, so 3D window-flipping alt-tab can look fancy). There are too many reasons to list here for not drawing directly on video RAM.

在像素图缓冲区上绘图,然后使用图形 API 将其复制到屏幕上是可能的.尽管如此,做位图图形或多或少已经过时了,除非您正在为 PNG 或 JPEG 或其他东西生成图像(例如,优化将直方图箱转换为 Web 服务后端代码中的散点图).现代图形 API 抽象了分辨率,因此无论每个像素有多大,您的应用程序都可以以合理的大小绘制内容.(小但极高的分辨率屏幕与低分辨率的大电视).

Drawing on a pixmap buffer and then using a graphics API to copy it to the screen is possible. Still, doing bitmap graphics at all is more or less obsolete, unless you're generating images for PNG or JPEG or something (e.g. optimize converting histogram bins to a scatter plot in the back-end code for a web service). Modern graphics APIs abstract away the resolution, so your app can draw things at a reasonable size regardless of how big each pixel is. (small but extremely high rez screen vs. big TV at low rez).

写入内存并在屏幕上看到一些变化是很酷的.或者更好的是,将 LED(带有小电阻)连接到并行端口上的数据位,然后运行 ​​outb 指令来打开/关闭它们.我很久以前在我的 Linux 系统上这样做过.我制作了一个使用 iopl(2) 和内联 asm 的小包装程序,并以 root 身份运行它.你可能可以在 Windows 上做类似的事情.您不需要 DOS 或 16 位代码来与硬件交谈.

It is kind of cool to write to memory and see something change on-screen. Or even better, hook up LEDs (with small resistors) to the data bits on a parallel port, and run an outb instruction to turn them on/off. I did this on my Linux system ages ago. I made a little wrapper program that used iopl(2) and inline asm, and ran it as root. You can probably do similar on Windows. You don't need DOS or 16bit code to get your feet wet talking to the hardware.

in/out 指令,以及对内存映射 IO 的正常加载/存储和 DMA,是真正的驱动程序与硬件对话的方式,包括比并行端口复杂得多的东西.了解您的硬件"真正"如何工作很有趣,但只有在您真正感兴趣或想要编写驱动程序时才花时间在这上面.Linux 源代码树包括用于大量硬件的驱动程序,并且通常有很好的注释,所以如果您喜欢阅读代码和编写代码一样多,这是另一种了解真正的驱动程序在与硬件对话时所做事情的方式.

in/out instructions, and normal loads/stores to memory-mapped IO, and DMA, are how real drivers talk to hardware, including things far more complicated than parallel ports. It's fun to know how your hardware "really" works, but only spend time on it if you're actually interested, or want to write drivers. The Linux source tree includes drivers for boatloads of hardware, and is often well commented, so if you like reading code as much as writing code, that's another way to get a feel for what real drivers do when they talk to hardware.

了解事情的内幕通常是好的.如果您了解图形在很久以前是如何工作的(使用 VGA 文本模式和颜色/属性字节),那么当然要发疯了.请注意,现代操作系统不使用 VGA 文本模式,因此您甚至没有了解现代计算机上发生的事情.

It's generally good to have some idea how things work under the hood. If you want to learn about how graphics used to work ages ago (with VGA text mode and color / attribute bytes), then sure, go nuts. Just be aware that modern OSes don't use VGA text mode, so you aren't even learning what happens under the hood on modern computers.

许多人喜欢 https://retrocomputing.stackexchange.com/,重温计算机不那么复杂、无法支持那么多抽象层的简单时光.请注意,这就是您正在做的事情.如果您确定这就是您想要了解 asm/硬件的原因,这可能是学习为现代硬件编写驱动程序的一个很好的垫脚石.

Many people enjoy https://retrocomputing.stackexchange.com/, reliving a simpler time when computers were less complex and couldn't support as many layers of abstraction. Just be aware that's what you're doing. It might be a good stepping stone to learning to write drivers for modern hardware, if you're sure that's why you want to understand asm / hardware.

您使用内联 ASM 的方法完全不正确.您似乎想在 asm 中编写整个函数,所以您应该那样做.例如将您的代码放在 asmfuncs.S 或其他东西中.如果您想继续使用 GNU/AT&T 语法,请使用 .S;或者使用 .asm 如果你想使用 Intel/NASM/YASM 语法(我会推荐,因为官方手册都使用 Intel 语法.请参阅 维基指南和手册.)

You are taking a totally incorrect approach to using inline ASM. You seem to want to write whole functions in asm, so you should just do that. e.g. put your code in asmfuncs.S or something. Use .S if you want to keep using GNU / AT&T syntax; or use .asm if you want to use Intel / NASM / YASM syntax (which I would recommend, since the official manuals all use Intel syntax. See the x86 wiki for guides and manuals.)

GNU 内联汇编是最难学习 ASM 的方法.您必须了解 asm 所做的一切,以及编译器需要了解的内容.真的很难把一切都做好.例如,在您的编辑中,该内联 asm 块修改了许多您没有列为损坏的寄存器,包括 %ebx 这是一个调用保留寄存器(因此即使该函数也会破坏没有内联).至少你去掉了 ret,所以当编译器将此函数内联到调用它的循环中时,事情不会那么严重.如果这听起来真的很复杂,那是因为它确实如此,这也是您不应该使用内联 asm 来学习 asm 的部分原因.

GNU inline asm is the hardest way to learn ASM. You have to understand everything that your asm does, and what the compiler needs to know about it. It's really hard to get everything right. For example, in your edit, that block of inline asm modifies many registers that you don't list as clobbered, including %ebx which is a call-preserved register (so this is broken even if that function isn't inlined). At least you took out the ret, so things won't break as spectacularly when the compiler inlines this function into the loop that calls it. If that sounds really complicated, that's because it is, and part of why you shouldn't use inline asm to learn asm.

这个对滥用内联的类似问题的回答asm 在尝试学习 asm 时 有更多关于内联 asm 以及如何很好地使用它的链接.

This answer to a similar question from misusing inline asm while trying to learn asm in the first place has more links about inline asm and how to use it well.

这部分可能是一个单独的答案,但我会把它放在一起.

This part could be a separate answer, but I'll leave it together.

除了您的整个方法从根本上来说是个坏主意之外,您的 put_char 函数至少存在一个特定问题:您使用 offset 作为仅输出操作数.gcc 很高兴地将您的整个函数编译为单个 ret 指令,因为 asm 语句不是 volatile,并且不使用其输出.(假设没有输出的内联 asm 语句是 volatile.)

Besides your whole approach being fundamentally a bad idea, there is at least one specific problem with your put_char function: you use offset as an output-only operand. gcc quite happily compiles your whole function to a single ret instruction, because the asm statement isn't volatile, and its output isn't used. (Inline asm statements without outputs are assumed to be volatile.)

把你的功能上godbolt ,所以我可以看看汇编编译器生成的周围.该链接是固定的可能工作版本,具有正确声明的破坏、评论、清理和优化.如果外部链接中断,请参阅下面的相同代码.

I put your function on godbolt, so I could look at what assembly the compiler generates surrounding it. That link is to the fixed maybe-working version, with correctly-declared clobbers, comments, cleanups, and optimizations. See below for the same code, if that external link ever breaks.

我使用带有 -m16 选项的 gcc 5.3,这与使用真正的 16 位编译器不同.它仍然以 32 位方式执行所有操作(使用 32 位地址、32 位 ints 和堆栈上的 32 位函数参数),但告诉汇编器 CPU 将处于 16 位模式,因此它会知道何时发出操作数大小和地址大小前缀.

I used gcc 5.3 with the -m16 option, which is different from using a real 16bit compiler. It still does everything the 32bit way (using 32bit addresses, 32bit ints, and 32bit function args on the stack), but tells the assembler that the CPU will be in 16bit mode, so it will know when to emit operand-size and address-size prefixes.

即使您使用-O0编译您的原始版本,编译器计算 offset = (y<<8) + (y<<6) + x;,但没有把它放在 %edi 中,因为你没有不要问它.将其指定为另一个输入操作数会起作用.内联汇编之后,它将 %edi 存储到 -12(%ebp),其中 offset 所在的位置.

Even if you compile your original version with -O0, the compiler computes offset = (y<<8) + (y<<6) + x;, but doesn't put it in %edi, because you didn't ask it to. Specifying it as another input operand would have worked. After the inline asm, it stores %edi into -12(%ebp), where offset lives.

put_char 的其他错误:

您通过全局变量而不是函数参数将 2 个东西(ascii_char 和 current_color)传递到您的函数中.哎呀,真恶心.VGA 和 characters 是常量,因此从全局变量加载它们看起来并不那么糟糕.用 asm 编写意味着只有当忽略良好的编码实践能对性能有合理的帮助时,你才应该忽略它.由于调用者可能不得不将这些值存储到全局变量中,与调用者将它们作为函数 args 存储在堆栈中相比,您并没有节省任何东西.而对于 x86-64,您将失去性能,因为调用者可以直接在寄存器中传递它们.

You pass 2 things (ascii_char and current_color) into your function through globals, instead of function arguments. Yuck, that's disgusting. VGA and characters are constants, so loading them from globals doesn't look so bad. Writing in asm means you should ignore good coding practices only when it helps performance by a reasonable amount. Since the caller probably had to store those values into the globals, you're not saving anything compared to the caller storing them on the stack as function args. And for x86-64, you'd be losing perf because the caller could just pass them in registers.

还有:

j,h,l,i=0;  // sets i=0, does nothing to j, h, or l.
       // gcc warns: left-hand operand of comma expression has no effect
j;h;l;i=0;  // equivalent to this

j=h=l=i=0;  // This is probably what you meant

除了offset之外,所有的局部变量都没有使用.你打算用 C 或其他语言编写它吗?

All the local variables are unused anyway, other than offset. Were you going to write it in C or something?

characters 使用 16 位地址,VGA 内存使用 32 位寻址模式.我认为这是故意的,但我不知道它是否正确.另外,您确定应该对 characters 的加载使用 CS: 覆盖吗?.rodata 部分是否进入代码段?尽管您没有将 uint8_t characters[464] 声明为 const,因此它可能只是在 .data 部分中.我认为自己很幸运,因为我实际上没有为分段内存模型编写代码,但这看起来仍然很可疑.

You use 16bit addresses for characters, but 32bit addressing modes for VGA memory. I assume this is intentional, but I have no idea if it's correct. Also, are you sure you should use a CS: override for the loads from characters? Does the .rodata section go into the code segment? Although you didn't declare uint8_t characters[464] as const, so it's probably just in the .data section anyway. I consider myself fortunate that I haven't actually written code for a segmented memory model, but that still looks suspicious.

如果您真的在使用 djgpp,那么根据 Michael Petch 的评论,您的代码将在 32 位模式下运行.因此,使用 16 位地址是个坏主意.

If you're really using djgpp, then according to Michael Petch's comment, your code will run in 32bit mode. Using 16bit addresses is thus a bad idea.

这样做可以完全避免使用 %ebx,而不是加载到 ebx 中,然后将 %ebx 添加到 %edi.

You can avoid using %ebx entirely by doing this, instead of loading into ebx and then adding %ebx to %edi.

 "add    _VGA, %%edi
	"   // load from _VGA, add to edi.

您不需要 lea 将地址存入寄存器.你可以使用

You don't need lea to get an address into a register. You can just use

    "mov    %%ax, %%si
	"
    "add    $_characters, %%si
	"

$_characters 表示地址为立即数.通过将其与之前计算的偏移量结合到位图 characters 数组中,我们可以节省大量指令.imul 的立即操作数形式让我们首先在 %si 中产生结果:

$_characters means the address as an immediate constant. We can save a lot of instructions by combining this with the previous calculation of the offset into the characters array of bitmaps. The immediate-operand form of imul lets us produce the result in %si in the first place:

    "movzbw _ascii_char,%%si
	"
       //"sub    $32,%%ax
	"      // AX = ascii_char - 32
    "imul   $7, %%si, %%si
	"
    "add    $(_characters - 32*7), %%si
	"  // Do the -32 at the same time as adding the table address, after multiplying
    // SI points to characters[(ascii_char-32)*7]
    // i.e. the start of the bitmap for the current ascii character.

由于这种形式的 imul 只保留了 16*16 的低 16b ->32b 乘法,2 和 3 操作数形式 imul 可用于有符号或无符号乘法,这就是为什么只有 imul(不是 mul)有那些额外的形式.对于更大的操作数大小的乘法,2 和 3 操作数 imul 更快,因为它不必将高半部分存储在 %[er]dx 中.

Since this form of imul only keeps the low 16b of the 16*16 -> 32b multiply, the 2 and 3 operand forms of imul can be used for signed or unsigned multiplies, which is why only imul (not mul) has those extra forms. For larger operand-size multiplies, 2 and 3 operand imul is faster, because it doesn't have to store the high half in %[er]dx.

您可以稍微简化内部循环,但会使外部循环稍微复杂化:您可以在零标志上进行分支,如 shl $1, %al 所设置,而不是使用计数器.这也会使它变得不可预测,比如非前景像素的跳过存储,因此增加的分支错误预测可能比额外的无所作为循环更糟糕.这也意味着您每次都需要在外循环中重新计算 %edi ,因为内循环不会运行恒定的次数.但它可能看起来像:

You could simplify the inner loop a bit, but it would complicate the outer loop slightly: you could branch on the zero flag, as set by shl $1, %al, instead of using a counter. That would make it also unpredictable, like the jump over store for non-foreground pixels, so the increased branch mispredictions might be worse than the extra do-nothing loops. It would also mean you'd need to recalculate %edi in the outer loop each time, because the inner loop wouldn't run a constant number of times. But it could look like:

    ... same first part of the loop as before
    // re-initialize %edi to first_pixel-1, based on outer-loop counter
    "lea  -1(%%edi), %%ebx
"
    ".Lbit_loop:
	"      // map the 1bpp bitmap to 8bpp VGA memory
        "incl   %%ebx
	"       // inc before shift, to preserve flags
        "shl    $1,%%al
	"
        "jnc    .Lskip_store
	"   // transparency: only store on foreground pixels
        "movb   %%dl,(%%ebx)
"  //plot the pixel
    ".Lskip_store:
	"
        "jnz  .Lbit_loop
	"    // flags still set from shl

        "addl   $320,%%edi
	"  // WITHOUT the -6
        "dec    %%cl
	"
        "jnz  .Lbyte_loop
	"

请注意,字符位图中的位将映射到 VGA 内存中的字节,例如 {7 6 5 4 3 2 1 0},因为您正在测试移出的位移.所以它从 MSB 开始.寄存器中的位总是大端".即使在像 x86 这样的小端机器上,左移也会乘以 2.小端仅影响内存中字节的顺序,而不影响字节中的位,甚至不影响寄存器中的字节.

Note that the bits in your character bitmaps are going to map to bytes in VGA memory like {7 6 5 4 3 2 1 0}, because you're testing the bit shifted out by a left shift. So it starts with the MSB. Bits in a register are always "big endian". A left shift multiplies by two, even on a little-endian machine like x86. Little-endian only affects ordering of bytes in memory, not bits in a byte, and not even bytes inside registers.

这与 Godbolt 链接相同.

This is the same as the godbolt link.

void put_char(int x,int y){
    int offset = (y<<8) + (y<<6) + x;
    __asm__ volatile (  // volatile is implicit for asm statements with no outputs, but better safe than sorry.

        "add    _VGA, %%edi
	" // edi points to VGA + offset.

        "movzbw _ascii_char,%%si
	"   // Better: use an input operand

        //"sub    $32,%%ax
	"      // AX = ascii_char - 32
        "imul   $7, %%si, %%si
	"     // can't fold the load into this because it's not zero-padded
        "add    $(_characters - 32*7), %%si
	"  // Do the -32 at the same time as adding the table address, after multiplying
        // SI points to characters[(ascii_char-32)*7]
        // i.e. the start of the bitmap for the current ascii character.

        "mov    $7,%%cl
"

        ".Lbyte_loop:
	"
            "lodsb  %%cs:(%%si)
	"   //load next byte of bitmap 

            "mov    $6,%%ch
"
        ".Lbit_loop:
	"      // map the 1bpp bitmap to 8bpp VGA memory
            "shl    $1,%%al
	"
            "jnc    .Lskip_store
	"   // transparency: only store on foreground pixels
            "movb   %%dl,(%%edi)
"  //plot the pixel
        ".Lskip_store:
	"
            "incl   %%edi
	"
            "dec    %%ch
	"
            "jnz  .Lbit_loop
	"

            "addl   $320-6,%%edi
	"
            "dec    %%cl
	"
            "jnz  .Lbyte_loop
	"


        : "+&D" (offset)        // EDI modified by the asm, compiler needs to know that, so use a read-write "+" input.  Early-clobber "&" because we read the other input after modifying this.
        : "d" (current_color)  // used read-only
        : "%eax", "%ecx", "%esi", "memory"
         // omit the memory clobber if your C never touches VGA memory, and your asm never loads/stores anywhere else.
         // but that's not the case here: the asm loads from memory written by C
         // without listing it as a memory operand (even a pointer in a register isn't sufficient)
         // so gcc might optimize away "dead" stores to it, or reorder the asm with loads/stores to it.    
    );
}

Re:"memory" 破坏者,参见 如何指示可以使用内联 ASM 参数*指向*的内存?

Re: the "memory" clobber, see How can I indicate that the memory *pointed* to by an inline ASM argument may be used?

我没有使用虚拟输出操作数来让寄存器分配由编译器自行决定,但这是一个好主意,可以减少在正确位置获取数据以进行内联汇编的开销.(额外的 mov 指令).例如,这里不需要强制编译器将 offset 放在 %edi 中.它可能是我们尚未使用的任何寄存器.

I didn't use dummy output operands to leave register allocation up to the compiler's discretion, but that's a good idea to reduce the overhead of getting data in the right places for inline asm. (extra mov instructions). For example, here there was no need to force the compiler to put offset in %edi. It could have been any register we aren't already using.

这篇关于使用 GNU C 内联汇编在 VGA 内存中绘制字符的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆